xu-song committed
Commit 4427304
1 Parent(s): bb48dd8
Files changed (3)
  1. playground_examples.py +138 -113
  2. playground_util.py +63 -30
  3. vocab.py +1 -1
playground_examples.py CHANGED
@@ -1,113 +1,138 @@
- """
-
- ## characters
-
- - alphanumeric characters
- - numeric characters
- - special characters: A special character is a character that is not an alphabetic or numeric character.
- - ASCII control characters
- - punctuation marks
- - accent marks
- - math symbols
- - whitespace:
- - https://en.wikipedia.org/wiki/Whitespace_character
- - https://emptycharacter.com/
-
-
- https://www.computerhope.com/jargon/s/specchar.htm
- """
- import random
- from datasets import load_dataset
-
- default_user_input = """\
- Replace this text in the input field to see how tokenization works.
- Buenos días!
- 华为发布Mate60手机。
- ラグビーワールドカップ2023フランス"""
- # default_tokenizer_name_1 = "Meta/llama3"
- default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
- default_tokenizer_name_2 = "openai/gpt-4o"
-
-
-
- def get_sample_input():
-     default_inputs = {
-         "en": "Replace this text in the input field to see how tokenization works.",
-         "zh-Hans": "",
-         "es": "",
-         "de": "",
-     }
-     random.seed(10)  # For reproducibility
-     lines = []
-     for lang in default_inputs.keys():
-         dataset = load_dataset("eson/cc100-samples", lang, split="train")
-         print(dataset)
-         print(1)
-     return default_inputs
-
-
- examples = {
-     "en": [
-         ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
-         ["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "huggyllama/llama-7b", "google-bert/bert-base-cased"],  # chatglm has blank_n tokens, bert drops the whitespace
-         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-         ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "google/gemma-7b", "huggyllama/llama-7b"],  # the llama vocab is rather small
-         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan-inc/Baichuan-7B", "huggyllama/llama-7b"],
-         # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
-     ],
-     "zh": [
-         ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],  # chatglm has blank_n tokens
-         ["标点测试:,。!?;", "baichuan_7b", "llama"],
-         ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
-         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
-         ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
-     ]
- }
-
-
-
- more_examples = [
-     # bert family
-     ("google-bert/bert-base-cased", "google-bert/bert-base-uncased", "", ""),  # # clue VS kplug, bert VS clue
-     ("bert-base-cased", "clue", "", "增加了[]()"),
-     ("roberta-chinese-clue", "kplug", "", ""),
-
-     # llama family (sentencepiece-based)
-     ("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
-     ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
-     ("llama", "chinese-llama-2-7b", ""),
-     ("llama", "llama3", "扩充词典"),
-     ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
-
-     # glm family (sentencepiece-based)
-     ("glm", "chatglm1", ""),
-     ("chatglm1", "chatglm2", ""),
-
-     # gpt2 family
-     ("gpt2", "moss", ""),
-     ("", "", ""),
-
-     # openai family (tiktoken)
-     ("qwen", "gpt_35_turbo", ""),
-
- ]
-
- lang = "en"
-
- example_types = [t[0].split(":")[0] for t in examples[lang]]
-
-
- def example_fn(example_idx):
-     return examples[lang][example_idx]
-
-
- def get_more_example():
-     import urllib.parse
-     url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
-     for tokenizer1, tokenizer2, text, comment in more_examples:
-         full_url = f'{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}'
-         print(full_url)
-
-
- if __name__ == "__main__":
-     get_more_example()
+ """
+
+ ## characters
+
+ - alphanumeric characters
+ - numeric characters
+ - special characters: A special character is a character that is not an alphabetic or numeric character.
+ - ASCII control characters
+ - punctuation marks
+ - accent marks
+ - math symbols
+ - whitespace:
+ - https://en.wikipedia.org/wiki/Whitespace_character
+ - https://emptycharacter.com/
+
+
+ https://www.computerhope.com/jargon/s/specchar.htm
+ """
+
+ import random
+ from datasets import load_dataset
+
+ default_user_input = """\
+ Replace this text in the input field to see how tokenization works.
+ Buenos días!
+ 华为发布Mate60手机。
+ ラグビーワールドカップ2023フランス"""
+ # default_tokenizer_name_1 = "Meta/llama3"
+ # default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
+ default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
+ default_tokenizer_name_2 = "openai/gpt-4o"
+
+
+ def get_sample_input():
+     default_inputs = {
+         "en": "Replace this text in the input field to see how tokenization works.",
+         "zh-Hans": "",
+         "es": "",
+         "de": "",
+     }
+     random.seed(10)  # For reproducibility
+     lines = []
+     for lang in default_inputs.keys():
+         dataset = load_dataset("eson/cc100-samples", lang, split="train")
+         print(dataset)
+         print(1)
+     return default_inputs
+
+
+ examples = {
+     "en": [
+         ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
+         [
+             "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
+             "huggyllama/llama-7b",
+             "google-bert/bert-base-cased",
+         ],  # chatglm has blank_n tokens, bert drops the whitespace
+         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
+         [
+             'punctuation: ,.:/?+=",。!?;【】〔〕〖〗',
+             "google/gemma-7b",
+             "huggyllama/llama-7b",
+         ],  # the llama vocab is rather small
+         [
+             "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
+             "baichuan-inc/Baichuan-7B",
+             "huggyllama/llama-7b",
+         ],
+         # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
+     ],
+     "zh": [
+         [
+             "空格测试: 2个空格 8个空格",
+             "llama",
+             "chatglm2_6b",
+         ],  # chatglm has blank_n tokens
+         ["标点测试:,。!?;", "baichuan_7b", "llama"],
+         [
+             "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
+             "baichuan_7b",
+             "llama",
+         ],
+         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
+         ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
+     ],
+ }
+
+
+ more_examples = [
+     # bert family
+     (
+         "google-bert/bert-base-cased",
+         "google-bert/bert-base-uncased",
+         "",
+         "",
+     ),  # # clue VS kplug, bert VS clue
+     ("bert-base-cased", "clue", "", "增加了[]()"),
+     ("roberta-chinese-clue", "kplug", "", ""),
+     # llama family (sentencepiece-based)
+     (
+         "baichuan",
+         "baichuan2",
+         "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1",
+     ),
+     ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
+     ("llama", "chinese-llama-2-7b", ""),
+     ("llama", "llama3", "扩充词典"),
+     ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
+     # glm family (sentencepiece-based)
+     ("glm", "chatglm1", ""),
+     ("chatglm1", "chatglm2", ""),
+     # gpt2 family
+     ("gpt2", "moss", ""),
+     ("", "", ""),
+     # openai family (tiktoken)
+     ("qwen", "gpt_35_turbo", ""),
+ ]
+
+ lang = "en"
+
+ example_types = [t[0].split(":")[0] for t in examples[lang]]
+
+
+ def example_fn(example_idx):
+     return examples[lang][example_idx]
+
+
+ def get_more_example():
+     import urllib.parse
+
+     url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
+     for tokenizer1, tokenizer2, text, comment in more_examples:
+         full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
+         print(full_url)
+
+
+ if __name__ == "__main__":
+     get_more_example()
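For reference, the share URLs that get_more_example() prints can also be built outside the Space. The sketch below is not part of the commit; build_share_url is a hypothetical helper, and it pads short entries because some tuples in more_examples carry three fields while the loop unpacks four.

    import urllib.parse

    URL_PREFIX = "https://huggingface.co/spaces/eson/tokenizer-arena"

    def build_share_url(entry):
        # Pad to four fields so (tokenizer1, tokenizer2, text, comment) always unpacks,
        # even for the three-element tuples in more_examples.
        tokenizer1, tokenizer2, text, comment = (tuple(entry) + ("", "", "", ""))[:4]
        return f"{URL_PREFIX}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"

    print(build_share_url(("llama", "llama3", "扩充词典")))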
playground_util.py CHANGED
@@ -6,18 +6,24 @@ from vocab import tokenizer_factory
  from character_util import iter_vocab
  from utils.log_util import logger
  from utils.i18n_util import get_lang
- from playground_examples import default_tokenizer_name_1, default_tokenizer_name_2, default_user_input
+ from playground_examples import (
+     default_tokenizer_name_1,
+     default_tokenizer_name_2,
+     default_user_input,
+ )
  from functools import lru_cache


  @lru_cache
  def _tokenize(
-     text: str,
-     tokenizer_name: str,
-     color_num: int = 5,
-     add_special_token: bool = False
+     text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
  ):
-     logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
+     logger.info(
+         "param="
+         + json.dumps(
+             {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
+         )
+     )
      pos_tokens = []
      tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
      if add_special_token:
@@ -28,19 +34,31 @@ def _tokenize(
      table = []

      for idx, token_id in enumerate(encoding):
-         decoded_text = tokenizer.decode([token_id])  # special characters all decode to �, i.e. "\ufffd"
+         decoded_text = tokenizer.decode(
+             [token_id]
+         )  # special characters all decode to �, i.e. "\ufffd"
          pos_tokens.extend([(decoded_text, str(idx % color_num))])

          # token "Byte":  # presumably this is the utf-8 encoding?
-         token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
+         token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[
+             0
+         ]
          if isinstance(token, bytes):
              try:
                  token_str = token.decode("utf-8")
              except:
                  token_str = token.decode("utf-8", errors="ignore")
-                 logger.error(f"{idx}: decode_error: " + json.dumps(  # gpt_35_turbo tokens frequently fail to decode, log them here
-                     {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
-                     ensure_ascii=False))
+                 logger.error(
+                     f"{idx}: decode_error: "
+                     + json.dumps(  # gpt_35_turbo tokens frequently fail to decode, log them here
+                         {
+                             "tokenizer_type": tokenizer_name,
+                             "token": str(token),
+                             "token_str": token_str,
+                         },
+                         ensure_ascii=False,
+                     )
+                 )

              token_bytes = token
              # json_dumps = json.dumps(token_str)
@@ -49,8 +67,12 @@ def _tokenize(
              token_bytes = bytes(token_str, "utf-8")
              # json_dumps = json.dumps(token_str)
          else:
-             logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
-                 {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
+             logger.error(
+                 f"{idx}: wrong type for token {token_id} {type(token)} "
+                 + json.dumps(
+                     {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
+                 )
+             )
              token_str = token
              token_bytes = token
              # continue
@@ -58,13 +80,14 @@ def _tokenize(
          # ⭐
          # TODO: gpt3.5_turbo bug: only the id and the text are right; the token and the utf8 bytes are both wrong, so convert_ids_to_tokens is failing.
          table.append(
-             {"TokenID": token_id,
-              "Token": token_str,  # the utf-8-decoded string; why do some appear as <0xE7>, and what does that mean? e.g. llama
-              "Text": decoded_text,  #
-              # "Bytes": token_bytes,  # bytes get decoded to strings on the gradio frontend, e.g. b'\xe4\xb8\xad' still renders as "中", hence str(token_bytes)
-              "UTF8 Bytes": str(token_bytes),
-              # "Unicode": json_dumps  # unicode: shown as-is if ascii, otherwise shown as the unicode escape
-              }
+             {
+                 "TokenID": token_id,
+                 "Token": token_str,  # the utf-8-decoded string; why do some appear as <0xE7>, and what does that mean? e.g. llama
+                 "Text": decoded_text,  #
+                 # "Bytes": token_bytes,  # bytes get decoded to strings on the gradio frontend, e.g. b'\xe4\xb8\xad' still renders as "中", hence str(token_bytes)
+                 "UTF8 Bytes": str(token_bytes),
+                 # "Unicode": json_dumps  # unicode: shown as-is if ascii, otherwise shown as the unicode escape
+             }
          )

      table_df = pd.DataFrame(table)
@@ -73,15 +96,14 @@ def _tokenize(


  def tokenize(
-     text: str,
-     tokenizer_name: str,
-     color_num: int = 5,
-     add_special_token: bool = False
+     text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
  ):
-     """ tokenize wrapper
+     """tokenize wrapper
      As gr.Update would be overwritten after passing to frontend, we apply lru_cache in _tokenize.
      """
-     pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num, add_special_token)
+     pos_tokens, num_tokens, table_df = _tokenize(
+         text, tokenizer_name, color_num, add_special_token
+     )
      return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df


@@ -97,7 +119,7 @@ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
  @lru_cache
  def basic_count(tokenizer_name):
      stats = iter_vocab(tokenizer_name)
-     return stats['vocab_size'], f'{stats["organization"]}'
+     return stats["vocab_size"], f'{stats["organization"]}'
      # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'


@@ -125,9 +147,14 @@ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
      vocab_set_2 = set([token.encode("utf-8") for token in vocab_set_2])

      overlap_tokens = vocab_set_1 & vocab_set_2
+     # TODO: visualize the add_tokens, del_tokens in a Venn diagram
+     # TODO: visualize the add_tokens, del_tokens as a git diff
+     # add_tokens = [token for token in vocab_set_2 if token not in vocab_set_1]
+     # del_tokens = [token for token in vocab_set_1 if token not in vocab_set_2]
      overlap_token_size = len(overlap_tokens)
      logger.info(
-         f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}")
+         f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}"
+     )
      return overlap_token_size, overlap_token_size


@@ -166,10 +193,16 @@ def on_load(url_params, request: gr.Request):


  def test_coding():
-     bytes1 = b'\xe4\xb8\xad'
+     bytes1 = b"\xe4\xb8\xad"
      print(bytes1)  # b'\xe4\xb8\xad'


  if __name__ == "__main__":
-     print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
+
+     # print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
+     print(
+         get_overlap_token_size(
+             "gradientai/Llama-3-8B-Instruct-Gradient-1048k", "deepseek-ai/DeepSeek-R1"
+         )
+     )
      # print(basic_count("internlm_chat_7b"))
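For reference, the per-token loop inside _tokenize() can be reproduced outside the Space. The sketch below is not part of the commit: it assumes the transformers package is available and uses gpt2 as a stand-in for the tokenizers served by tokenizer_factory.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    text = "Replace this text in the input field to see how tokenization works."
    encoding = tokenizer.encode(text)

    for idx, token_id in enumerate(encoding):
        decoded_text = tokenizer.decode([token_id])              # what the UI displays
        token = tokenizer.convert_ids_to_tokens([token_id])[0]   # raw vocab entry (str, or bytes for some vocabs)
        token_bytes = token.encode("utf-8") if isinstance(token, str) else token
        print(token_id, repr(token), repr(decoded_text), token_bytes)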
vocab.py CHANGED
@@ -378,7 +378,7 @@ _all_tokenizer_config = [
      TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
      TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
      TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
-     TokenizerConfig("deepseek-ai/DeepSeek-R1", org="DeepSeek"),
+     TokenizerConfig("deepseek-ai/DeepSeek-R1", org="DeepSeek"),  # built on the llama3 vocab: some Chinese tokens added, some tokens removed
      TokenizerConfig("deepseek-ai/DeepSeek-R1-Zero", org="DeepSeek"),
      TokenizerConfig("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", org="DeepSeek"),