xu-song committed
Commit 4427304
1 Parent(s): bb48dd8
Files changed (3)
  1. playground_examples.py +138 -113
  2. playground_util.py +63 -30
  3. vocab.py +1 -1
playground_examples.py CHANGED
@@ -1,113 +1,138 @@
- """
-
- ## characters
-
- - alphanumeric characters
- - numeric characters
- - special characters: A special character is a character that is not an alphabetic or numeric character.
- - ASCII control characters
- - punctuation marks
- - accent marks
- - math symbols
- - whitespace:
- - https://en.wikipedia.org/wiki/Whitespace_character
- - https://emptycharacter.com/
-
-
- https://www.computerhope.com/jargon/s/specchar.htm
- """
- import random
- from datasets import load_dataset
-
- default_user_input = """\
- Replace this text in the input field to see how tokenization works.
- Buenos días!
- 华为发布Mate60手机。
- ラグビーワールドカップ2023フランス"""
- # default_tokenizer_name_1 = "Meta/llama3"
- default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
- default_tokenizer_name_2 = "openai/gpt-4o"
-
-
-
- def get_sample_input():
-     default_inputs = {
-         "en": "Replace this text in the input field to see how tokenization works.",
-         "zh-Hans": "",
-         "es": "",
-         "de": "",
-     }
-     random.seed(10)  # For reproducibility
-     lines = []
-     for lang in default_inputs.keys():
-         dataset = load_dataset("eson/cc100-samples", lang, split="train")
-         print(dataset)
-         print(1)
-     return default_inputs
-
-
- examples = {
-     "en": [
-         ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
-         ["whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "huggyllama/llama-7b", "google-bert/bert-base-cased"],  # chatglm has blank_n tokens, bert drops the whitespace
-         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
-         ["punctuation: ,.:/?+=\",。!?;【】〔〕〖〗", "google/gemma-7b", "huggyllama/llama-7b"],  # the llama vocab is rather small
-         ["symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan-inc/Baichuan-7B", "huggyllama/llama-7b"],
-         # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
-     ],
-     "zh": [
-         ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"],  # chatglm has blank_n tokens
-         ["标点测试:,。!?;", "baichuan_7b", "llama"],
-         ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
-         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
-         ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
-     ]
- }
-
-
-
- more_examples = [
-     # bert family
-     ("google-bert/bert-base-cased", "google-bert/bert-base-uncased", "", ""),  # # clue VS kplug, bert VS clue
-     ("bert-base-cased", "clue", "", "增加了[]()"),
-     ("roberta-chinese-clue", "kplug", "", ""),
-
-     # llama family (sentencepiece-based)
-     ("baichuan", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1"),
-     ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
-     ("llama", "chinese-llama-2-7b", ""),
-     ("llama", "llama3", "扩充词典"),
-     ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
-
-     # glm family (sentencepiece-based)
-     ("glm", "chatglm1", ""),
-     ("chatglm1", "chatglm2", ""),
-
-     # gpt2 family
-     ("gpt2", "moss", ""),
-     ("", "", ""),
-
-     # openai family (tiktoken)
-     ("qwen", "gpt_35_turbo", ""),
-
- ]
-
- lang = "en"
-
- example_types = [t[0].split(":")[0] for t in examples[lang]]
-
-
- def example_fn(example_idx):
-     return examples[lang][example_idx]
-
-
- def get_more_example():
-     import urllib.parse
-     url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
-     for tokenizer1, tokenizer2, text, comment in more_examples:
-         full_url = f'{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}'
-         print(full_url)
-
-
- if __name__ == "__main__":
-     get_more_example()
+ """
+
+ ## characters
+
+ - alphanumeric characters
+ - numeric characters
+ - special characters: A special character is a character that is not an alphabetic or numeric character.
+ - ASCII control characters
+ - punctuation marks
+ - accent marks
+ - math symbols
+ - whitespace:
+ - https://en.wikipedia.org/wiki/Whitespace_character
+ - https://emptycharacter.com/
+
+
+ https://www.computerhope.com/jargon/s/specchar.htm
+ """
+
+ import random
+ from datasets import load_dataset
+
+ default_user_input = """\
+ Replace this text in the input field to see how tokenization works.
+ Buenos días!
+ 华为发布Mate60手机。
+ ラグビーワールドカップ2023フランス"""
+ # default_tokenizer_name_1 = "Meta/llama3"
+ # default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
+ default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
+ default_tokenizer_name_2 = "openai/gpt-4o"
+
+
+ def get_sample_input():
+     default_inputs = {
+         "en": "Replace this text in the input field to see how tokenization works.",
+         "zh-Hans": "",
+         "es": "",
+         "de": "",
+     }
+     random.seed(10)  # For reproducibility
+     lines = []
+     for lang in default_inputs.keys():
+         dataset = load_dataset("eson/cc100-samples", lang, split="train")
+         print(dataset)
+         print(1)
+     return default_inputs
+
+
+ examples = {
+     "en": [
+         ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
+         [
+             "whitespace: 2spaces 8spaces\t1tab\t\t2tab\n1newline",
+             "huggyllama/llama-7b",
+             "google-bert/bert-base-cased",
+         ],  # chatglm has blank_n tokens, bert drops the whitespace
+         # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
+         [
+             'punctuation: ,.:/?+=",。!?;【】〔〕〖〗',
+             "google/gemma-7b",
+             "huggyllama/llama-7b",
+         ],  # the llama vocab is rather small
+         [
+             "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
+             "baichuan-inc/Baichuan-7B",
+             "huggyllama/llama-7b",
+         ],
+         # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
+     ],
+     "zh": [
+         [
+             "空格测试: 2个空格 8个空格",
+             "llama",
+             "chatglm2_6b",
+         ],  # chatglm has blank_n tokens
+         ["标点测试:,。!?;", "baichuan_7b", "llama"],
+         [
+             "符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
+             "baichuan_7b",
+             "llama",
+         ],
+         ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
+         ["中文简体:宽带,繁体:樂來", "baichuan_7b", "llama"],
+     ],
+ }
+
+
+ more_examples = [
+     # bert family
+     (
+         "google-bert/bert-base-cased",
+         "google-bert/bert-base-uncased",
+         "",
+         "",
+     ),  # # clue VS kplug, bert VS clue
+     ("bert-base-cased", "clue", "", "增加了[]()"),
+     ("roberta-chinese-clue", "kplug", "", ""),
+     # llama family (sentencepiece-based)
+     (
+         "baichuan",
+         "baichuan2",
+         "baichuan2支持多空格 ,多个换行\n\n\n,do not add dummy prefix as Baichuan1",
+     ),
+     ("llama", "baichuan2", "baichuan2支持多空格 ,多个换行\n\n"),
+     ("llama", "chinese-llama-2-7b", ""),
+     ("llama", "llama3", "扩充词典"),
+     ("chinese-llama-lora-7b", "chinese-llama-2-7b", ""),
+     # glm family (sentencepiece-based)
+     ("glm", "chatglm1", ""),
+     ("chatglm1", "chatglm2", ""),
+     # gpt2 family
+     ("gpt2", "moss", ""),
+     ("", "", ""),
+     # openai family (tiktoken)
+     ("qwen", "gpt_35_turbo", ""),
+ ]
+
+ lang = "en"
+
+ example_types = [t[0].split(":")[0] for t in examples[lang]]
+
+
+ def example_fn(example_idx):
+     return examples[lang][example_idx]
+
+
+ def get_more_example():
+     import urllib.parse
+
+     url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
+     for tokenizer1, tokenizer2, text, comment in more_examples:
+         full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
+         print(full_url)
+
+
+ if __name__ == "__main__":
+     get_more_example()
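For reference, the share URLs that get_more_example() prints can also be built outside the Space. The sketch below is not part of the commit; build_share_url is a hypothetical helper, and it pads short entries because some tuples in more_examples carry three fields while the loop unpacks four.

    import urllib.parse

    URL_PREFIX = "https://huggingface.co/spaces/eson/tokenizer-arena"

    def build_share_url(entry):
        # Pad to four fields so (tokenizer1, tokenizer2, text, comment) always unpacks,
        # even for the three-element tuples in more_examples.
        tokenizer1, tokenizer2, text, comment = (tuple(entry) + ("", "", "", ""))[:4]
        return f"{URL_PREFIX}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"

    print(build_share_url(("llama", "llama3", "扩充词典")))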
playground_util.py CHANGED
@@ -6,18 +6,24 @@ from vocab import tokenizer_factory
  from character_util import iter_vocab
  from utils.log_util import logger
  from utils.i18n_util import get_lang
- from playground_examples import default_tokenizer_name_1, default_tokenizer_name_2, default_user_input
+ from playground_examples import (
+     default_tokenizer_name_1,
+     default_tokenizer_name_2,
+     default_user_input,
+ )
  from functools import lru_cache


  @lru_cache
  def _tokenize(
-     text: str,
-     tokenizer_name: str,
-     color_num: int = 5,
-     add_special_token: bool = False
+     text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
  ):
-     logger.info("param=" + json.dumps({"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
+     logger.info(
+         "param="
+         + json.dumps(
+             {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
+         )
+     )
      pos_tokens = []
      tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
      if add_special_token:
@@ -28,19 +34,31 @@ def _tokenize(
      table = []

      for idx, token_id in enumerate(encoding):
-         decoded_text = tokenizer.decode([token_id])  # special characters all decode to �, i.e. "\ufffd"
+         decoded_text = tokenizer.decode(
+             [token_id]
+         )  # special characters all decode to �, i.e. "\ufffd"
          pos_tokens.extend([(decoded_text, str(idx % color_num))])

          # token "Byte":  # presumably this is the utf-8 encoding?
-         token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[0]
+         token = tokenizer.convert_ids_to_tokens([token_id], skip_special_tokens=False)[
+             0
+         ]
          if isinstance(token, bytes):
              try:
                  token_str = token.decode("utf-8")
              except:
                  token_str = token.decode("utf-8", errors="ignore")
-                 logger.error(f"{idx}: decode_error: " + json.dumps(  # gpt_35_turbo tokens frequently fail to decode, log them here
-                     {"tokenizer_type": tokenizer_name, "token": str(token), "token_str": token_str},
-                     ensure_ascii=False))
+                 logger.error(
+                     f"{idx}: decode_error: "
+                     + json.dumps(  # gpt_35_turbo tokens frequently fail to decode, log them here
+                         {
+                             "tokenizer_type": tokenizer_name,
+                             "token": str(token),
+                             "token_str": token_str,
+                         },
+                         ensure_ascii=False,
+                     )
+                 )

              token_bytes = token
              # json_dumps = json.dumps(token_str)
@@ -49,8 +67,12 @@ def _tokenize(
              token_bytes = bytes(token_str, "utf-8")
              # json_dumps = json.dumps(token_str)
          else:
-             logger.error(f"{idx}: wrong type for token {token_id} {type(token)} " + json.dumps(
-                 {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False))
+             logger.error(
+                 f"{idx}: wrong type for token {token_id} {type(token)} "
+                 + json.dumps(
+                     {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
+                 )
+             )
              token_str = token
              token_bytes = token
              # continue
@@ -58,13 +80,14 @@ def _tokenize(
          # ⭐
          # TODO: gpt3.5_turbo bug: only the id and the text are right; the token and the utf8 bytes are both wrong, so convert_ids_to_tokens is failing.
          table.append(
-             {"TokenID": token_id,
-              "Token": token_str,  # the utf-8-decoded string; why do some appear as <0xE7>, and what does that mean? e.g. llama
-              "Text": decoded_text,  #
-              # "Bytes": token_bytes,  # bytes get decoded to strings on the gradio frontend, e.g. b'\xe4\xb8\xad' still renders as "中", hence str(token_bytes)
-              "UTF8 Bytes": str(token_bytes),
-              # "Unicode": json_dumps  # unicode: shown as-is if ascii, otherwise shown as the unicode escape
-              }
+             {
+                 "TokenID": token_id,
+                 "Token": token_str,  # the utf-8-decoded string; why do some appear as <0xE7>, and what does that mean? e.g. llama
+                 "Text": decoded_text,  #
+                 # "Bytes": token_bytes,  # bytes get decoded to strings on the gradio frontend, e.g. b'\xe4\xb8\xad' still renders as "中", hence str(token_bytes)
+                 "UTF8 Bytes": str(token_bytes),
+                 # "Unicode": json_dumps  # unicode: shown as-is if ascii, otherwise shown as the unicode escape
+             }
          )

      table_df = pd.DataFrame(table)
@@ -73,15 +96,14 @@ def _tokenize(


  def tokenize(
-     text: str,
-     tokenizer_name: str,
-     color_num: int = 5,
-     add_special_token: bool = False
+     text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
  ):
-     """ tokenize wrapper
+     """tokenize wrapper
      As gr.Update would be overwritten after passing to frontend, we apply lru_cache in _tokenize.
      """
-     pos_tokens, num_tokens, table_df = _tokenize(text, tokenizer_name, color_num, add_special_token)
+     pos_tokens, num_tokens, table_df = _tokenize(
+         text, tokenizer_name, color_num, add_special_token
+     )
      return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df


@@ -97,7 +119,7 @@ def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2):
  @lru_cache
  def basic_count(tokenizer_name):
      stats = iter_vocab(tokenizer_name)
-     return stats['vocab_size'], f'{stats["organization"]}'
+     return stats["vocab_size"], f'{stats["organization"]}'
      # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}'


@@ -125,9 +147,14 @@ def get_overlap_token_size(tokenizer_name_1, tokenizer_name_2):
      vocab_set_2 = set([token.encode("utf-8") for token in vocab_set_2])

      overlap_tokens = vocab_set_1 & vocab_set_2
+     # TODO: visualize the add_tokens, del_tokens in a Venn diagram
+     # TODO: visualize the add_tokens, del_tokens as a git diff
+     # add_tokens = [token for token in vocab_set_2 if token not in vocab_set_1]
+     # del_tokens = [token for token in vocab_set_1 if token not in vocab_set_2]
      overlap_token_size = len(overlap_tokens)
      logger.info(
-         f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}")
+         f"{overlap_token_size} OverlapTokens of {tokenizer_name_1} {tokenizer_name_2}: {list(overlap_tokens)[:10]}"
+     )
      return overlap_token_size, overlap_token_size


@@ -166,10 +193,16 @@ def on_load(url_params, request: gr.Request):


  def test_coding():
-     bytes1 = b'\xe4\xb8\xad'
+     bytes1 = b"\xe4\xb8\xad"
      print(bytes1)  # b'\xe4\xb8\xad'


  if __name__ == "__main__":
-     print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
+
+     # print(get_overlap_token_size("gpt-35-turbo", "gpt-4"))
+     print(
+         get_overlap_token_size(
+             "gradientai/Llama-3-8B-Instruct-Gradient-1048k", "deepseek-ai/DeepSeek-R1"
+         )
+     )
      # print(basic_count("internlm_chat_7b"))
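For reference, the per-token loop inside _tokenize() can be reproduced outside the Space. The sketch below is not part of the commit: it assumes the transformers package is available and uses gpt2 as a stand-in for the tokenizers served by tokenizer_factory.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    text = "Replace this text in the input field to see how tokenization works."
    encoding = tokenizer.encode(text)

    for idx, token_id in enumerate(encoding):
        decoded_text = tokenizer.decode([token_id])              # what the UI displays
        token = tokenizer.convert_ids_to_tokens([token_id])[0]   # raw vocab entry (str, or bytes for some vocabs)
        token_bytes = token.encode("utf-8") if isinstance(token, str) else token
        print(token_id, repr(token), repr(decoded_text), token_bytes)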
vocab.py CHANGED
@@ -378,7 +378,7 @@ _all_tokenizer_config = [
      TokenizerConfig("deepseek-ai/deepseek-llm-7b-base", org="DeepSeek"),
      TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
      TokenizerConfig("deepseek-ai/DeepSeek-V3", org="DeepSeek"),
-     TokenizerConfig("deepseek-ai/DeepSeek-R1", org="DeepSeek"),
+     TokenizerConfig("deepseek-ai/DeepSeek-R1", org="DeepSeek"),  # built on the llama3 vocab: some Chinese tokens added, some tokens removed
      TokenizerConfig("deepseek-ai/DeepSeek-R1-Zero", org="DeepSeek"),
      TokenizerConfig("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", org="DeepSeek"),