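"""Streamlit app comparing token counts across tokenizers and languages.

Loads a pre-tokenized dataset, shows per-tokenizer token counts for a randomly
sampled text in the selected language, and plots the token-count distribution
and variability for each selected tokenizer.
"""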
import streamlit as st
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
@st.cache_data
def load_data():
    # dataset.csv holds one row per text sample: a "lang" column, a "text"
    # column, and one token-count column per tokenizer listed below.
    return pd.read_csv("dataset.csv")
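def count_tokens(text: str, tokenizer_name: str) -> int:
    """Illustrative sketch, not called by this app: one way the per-tokenizer
    counts in dataset.csv could have been produced. Assumes the `transformers`
    package; the actual preprocessing script is not part of this file."""
    from transformers import AutoTokenizer  # local import: optional dependency

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    return len(tokenizer.encode(text, add_special_tokens=False))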
def reload_example_text_data(selected_language, selected_tokenizers):
    # Draw one random text in the chosen language from the global val_data and
    # stash its per-tokenizer token counts in session state for the table below.
    tempdf = val_data[val_data["lang"] == selected_language]
    random_sample = tempdf.sample(n=1)
    selected_text = random_sample["text"].iloc[0]
    st.session_state.examplesdf = random_sample[selected_tokenizers]
    return selected_text
val_data = load_data()
tokenizer_names_to_test = [
"openai/gpt4",
"Xenova/gpt-4o",
"Xenova/claude-tokenizer",
"CohereForAI/aya-101",
"meta-llama/Meta-Llama-3-70B",
"mistralai/Mixtral-8x22B-v0.1",
"google/gemma-7b",
"facebook/nllb-200-distilled-600M",
"xlm-roberta-base",
"bert-base-uncased",
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
"bigscience/bloom",
"StabilityAI/stablelm-base-alpha-7b",
"google/flan-t5-base",
"facebook/mbart-large-50",
"EleutherAI/gpt-neox-20b",
]
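# Note: "openai/gpt4" is not a Hugging Face Hub repo; it is special-cased when
# building links below and presumably denotes tiktoken's GPT-4 encoding.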
with st.sidebar:
    # The checkbox overrides the multiselect: when checked, every tokenizer
    # in the list above is compared.
    all_tokenizers = st.checkbox("Select All Tokenizers")
    if all_tokenizers:
        selected_tokenizers = tokenizer_names_to_test
    else:
        selected_tokenizers = st.multiselect(
            "Select tokenizers",
            options=tokenizer_names_to_test,
            default=[
                "openai/gpt4",
                "Xenova/gpt-4o",
                "CohereForAI/aya-101",
                "Xenova/claude-tokenizer",
            ],
            label_visibility="collapsed",
        )
    # Link each selected tokenizer to its Hub page; "openai/gpt4" links to the
    # tiktoken repository instead.
    links = [
        (
            f"[{tokenizer_name}](https://huggingface.co/{tokenizer_name})"
            if tokenizer_name != "openai/gpt4"
            else f"[{tokenizer_name}](https://github.com/openai/tiktoken)"
        )
        for tokenizer_name in selected_tokenizers
    ]
    link = "Tokenized using " + ", ".join(links)
    st.markdown(link, unsafe_allow_html=True)
    language_options = sorted(val_data["lang"].unique())
    selected_language = st.selectbox(
        "Select language",
        options=language_options,
        index=language_options.index("English") if "English" in language_options else 0,
        label_visibility="collapsed",
    )
selected_text = reload_example_text_data(selected_language, selected_tokenizers)
st.subheader(f"**Sampled Text:** `{selected_text}`")
st.subheader("Number of Tokens")
st.table(st.session_state.examplesdf)
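# The table above shows counts for the single sampled text; the plots below
# summarize token counts over the entire dataset.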
# Density plot of token counts per tokenizer over the full dataset
# (create_distplot with show_hist=False draws only the KDE curves).
hist_data = [val_data[tokenizer].dropna() for tokenizer in selected_tokenizers]
fig = ff.create_distplot(
    hist_data, selected_tokenizers, show_hist=False, show_rug=False
)
fig.update_layout(
    title="Token Distribution Density",
    xaxis_title="Number of Tokens",
    yaxis_title="Density",
    height=500,
)
st.plotly_chart(fig, use_container_width=True)
# Box plots summarizing the spread of token counts for each tokenizer.
tokenizer_to_num_tokens = {
    name: val_data[name].tolist() for name in selected_tokenizers
}
fig = go.Figure()
for tokenizer_name in selected_tokenizers:
    fig.add_trace(
        go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
    )
fig.update_layout(title="Token Count Variability")
st.plotly_chart(fig, use_container_width=True)
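# Run locally (assuming this file is saved as app.py):
#   streamlit run app.py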