import gradio as gr
from transformers import AutoTokenizer
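
# Dropdown labels mapped to the Hugging Face Hub model IDs they load.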
tokenizers = {
    "English - BERT (bert-base-uncased)": "bert-base-uncased",
    "Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)": "CAMeL-Lab/bert-base-arabic-camelbert-ca",
    "Arabic - Arabic BERT (asafaya/bert-base-arabic)": "asafaya/bert-base-arabic",
}


def tokenize_text(text, model_name):
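    # Model files are cached locally after the first download, so
    # re-loading the tokenizer on every call only re-builds the object.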
    tokenizer = AutoTokenizer.from_pretrained(tokenizers[model_name])
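    # tokenize() returns the subword pieces only; it does not add
    # special tokens such as [CLS] and [SEP].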
    tokens = tokenizer.tokenize(text)
return f"Tokens: {tokens}", f"Number of tokens: {len(tokens)}" |
|
|
|
|
|
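

# Input components for the Interface.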
model_choice = gr.Dropdown(
    choices=list(tokenizers.keys()),
    label="Select Tokenizer",
    value="English - BERT (bert-base-uncased)",
)
text_input = gr.Textbox(label="Enter a sentence to tokenize")
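
# Example rows pre-fill both inputs when clicked.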
examples = [
    ["The quick brown fox jumps over the lazy dog.", "English - BERT (bert-base-uncased)"],
    # Arabic: "The moon is beautiful in the sky."
    ["القمر جميل في السماء.", "Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)"],
]
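
# The function's two return values feed the two output Textboxes, in order.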
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[text_input, model_choice],
    outputs=[gr.Textbox(label="Tokens"), gr.Textbox(label="Number of Tokens")],
    title="Hugging Face Tokenizer Explorer",
    description="Enter a sentence or use one of the example sentences below to see how different tokenizers work.",
    examples=examples,
    allow_flagging="never",  # expects "never" / "auto" / "manual", not a bool
)

demo.launch()