# Real-time tokenizer comparison demo (Gradio Space).
# Standard library
import os
from typing import List

# Third-party
import gradio as gr
import tiktoken
from transformers import AutoTokenizer

# Hugging Face access token for gated models (e.g. Llama 2); may be unset.
HF_TOKEN = os.getenv("HF_TOKEN")
# (model id, use_fast) pairs for Hugging Face tokenizers.
# The Mongolian BERT models need the slow (SentencePiece-based) tokenizer.
hf_tokenizer_list = [
    ("tugstugi/bert-large-mongolian-cased", False),
    ("tugstugi/bert-large-mongolian-uncased", False),
    ("bayartsogt/mongolian-roberta-large", True),
    ("meta-llama/Llama-2-13b-hf", True),
    ("tiiuae/falcon-7b", True),
    ("bigscience/bloom", True),
]

# OpenAI model names resolved via tiktoken.
openai_tokenizer_list = [
    "text-davinci-003",
    "gpt-4",
]
# Instantiate every tokenizer once at startup, in the same order as the
# config lists above (the UI output columns rely on that ordering).
hf_tokenizers = []
for _model_id, _use_fast in hf_tokenizer_list:
    hf_tokenizers.append(
        AutoTokenizer.from_pretrained(
            _model_id,
            use_fast=_use_fast,
            trust_remote_code=True,
            token=HF_TOKEN,
        )
    )

openai_tokenizers = [tiktoken.encoding_for_model(m) for m in openai_tokenizer_list]
def do_tokenize(tokenizer, text: str) -> list[tuple[str, str]]:
    """Tokenize *text* and pair each decoded token with its position index.

    Fixes vs. original: the return annotation said ``List[str]`` but the
    function returns ``(token_text, index_str)`` tuples, and the parameter
    was annotated ``AutoTokenizer`` even though tiktoken ``Encoding``
    objects are passed in as well — any object with ``encode``/``decode``
    works, so the parameter is left duck-typed.

    Args:
        tokenizer: any object exposing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str`` (HF tokenizer or tiktoken Encoding).
        text: input string to tokenize.

    Returns:
        One ``(decoded_token, str(index))`` pair per token id, in order —
        the shape gr.HighlightedText expects.
    """
    return [(tokenizer.decode([token_id]), str(i)) for i, token_id in enumerate(tokenizer.encode(text))]
def do_simple_split(text: str):
    """Whitespace-split *text*, pairing each piece with its index as a string.

    Returns ``(word, str(index))`` tuples in the format gr.HighlightedText
    consumes; empty/whitespace-only input yields an empty list.
    """
    pieces = text.split()
    return list(zip(pieces, map(str, range(len(pieces)))))
def do_function(text: str):
    """Produce one value per UI output component for the given input text.

    Order matters and must mirror the Interface outputs list: the echoed
    input, its character count, a naive whitespace split, then one
    tokenization per HF tokenizer followed by one per OpenAI tokenizer.
    """
    outputs = [text, len(text), do_simple_split(text)]
    for tok in hf_tokenizers:
        outputs.append(do_tokenize(tok, text))
    for tok in openai_tokenizers:
        outputs.append(do_tokenize(tok, text))
    return tuple(outputs)
# Build the markdown description: one link line per tokenizer.
_links = [f"🤗 [{x}](https://huggingface.co/{x})" for x, _ in hf_tokenizer_list]
_links += [f"⏳ [{x}](https://github.com/openai/tiktoken)" for x in openai_tokenizer_list]
_description = "**Tokenizers:**\n" + "\n".join(_links)

# One output component per value returned by do_function, in the same order.
_outputs = [
    gr.Text("", label="input"),
    gr.Number(0, label="Character Count"),
    gr.HighlightedText("", label="Simple Split"),
]
_outputs += [gr.HighlightedText("", label=name) for name, _ in hf_tokenizer_list]
_outputs += [gr.HighlightedText("", label="openai/" + name) for name in openai_tokenizer_list]

demo = gr.Interface(
    do_function,
    [gr.Text("", placeholder="Мөнгөө тушаачихсаныхаа дараа мэдэгдээрэй")],
    _outputs,
    live=True,  # re-tokenize on every keystroke
    allow_flagging="never",
    title="Real-Time Tokenizer",
    description=_description,
)
if __name__ == "__main__":
    # Start the Gradio server only when executed as a script.
    demo.launch()