from typing import List, Tuple

import gradio as gr
import tiktoken
from transformers import AutoTokenizer
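
# Hugging Face tokenizers to compare, as (model repo id, use_fast) pairs.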
hf_tokenizer_list = [
("tugstugi/bert-large-mongolian-cased", False),
("tugstugi/bert-large-mongolian-uncased", False),
("bayartsogt/mongolian-roberta-large", True),
("meta-llama/Llama-2-13b-hf", True),
("bigscience/bloom", True),
]
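
# OpenAI models whose tiktoken encodings are shown for comparison.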
openai_tokenizer_list = [
"text-davinci-003",
"gpt-4"
]

# load all tokenizers once at startup
hf_tokenizers = [
    AutoTokenizer.from_pretrained(model_name_or_id, use_fast=use_fast)
    for model_name_or_id, use_fast in hf_tokenizer_list
]
openai_tokenizers = [
    tiktoken.encoding_for_model(name)
    for name in openai_tokenizer_list
]
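# Note: tiktoken downloads and caches each encoding's BPE file on first use,
# so the very first request may be slower.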


def do_tokenize(tokenizer, text: str) -> List[Tuple[str, str]]:
    # Accepts both AutoTokenizer and tiktoken.Encoding, since both expose
    # encode()/decode(); returns (token, label) pairs for gr.HighlightedText.
    return [(tokenizer.decode([token_id]), str(i)) for i, token_id in enumerate(tokenizer.encode(text))]


def do_simple_split(text: str) -> List[Tuple[str, str]]:
    # Naive whitespace split, shown as a baseline against the real tokenizers.
    return [(x, str(i)) for i, x in enumerate(text.split())]


def do_function(text: str):
    # Return values must match the order of the Interface output components.
    return (
        text,
        len(text),
        do_simple_split(text),
        *[do_tokenize(tokenizer, text) for tokenizer in hf_tokenizers],
        *[do_tokenize(tokenizer, text) for tokenizer in openai_tokenizers],
    )


demo = gr.Interface(
    do_function,
    [
        # Mongolian placeholder, roughly: "Let us know once you have deposited the money."
        gr.Text("", placeholder="Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй"),
    ],
    [
        gr.Text("", label="input"),
        gr.Number(0, label="Character Count"),
        gr.HighlightedText("", label="Simple Split"),
        *[gr.HighlightedText("", label=tokenizer_name) for tokenizer_name, _ in hf_tokenizer_list],
        *[gr.HighlightedText("", label="openai/" + tokenizer_name) for tokenizer_name in openai_tokenizer_list],
    ],
    live=True,
    allow_flagging="never",
    title="Real-Time Tokenizer",
    description=(
        "**Tokenizers:**\n" +
        "\n".join(
            [
                f"🤗 [{x}](https://huggingface.co/{x})"
                for x, _ in hf_tokenizer_list
            ] + [
                f"⏳ [{x}](https://github.com/openai/tiktoken)"
                for x in openai_tokenizer_list
            ]
        )
    ),
)


if __name__ == "__main__":
    demo.launch()
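
# When running locally rather than on a Space, demo.launch(share=True) would
# additionally expose a temporary public URL.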