import streamlit as st
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
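

# Cache the CSV read so the dataset is not reloaded on every Streamlit rerun.
# (An optional addition, not in the original script; st.cache_data is
# Streamlit's standard data-caching decorator.)
@st.cache_data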
def load_data():
    # One row per text sample: "lang" and "text" columns plus one
    # precomputed token-count column per tokenizer.
    return pd.read_csv("dataset.csv")


def reload_example_text_data(selected_language, selected_tokenizers):
    # Sample one text in the chosen language and stash its per-tokenizer
    # token counts in session state for the table below.
    tempdf = val_data[val_data["lang"] == selected_language]
    random_sample = tempdf.sample(n=1)
    selected_text = random_sample["text"].iloc[0]
    random_sample = random_sample[selected_tokenizers]
    random_sample.columns = selected_tokenizers
    st.session_state.examplesdf = random_sample
    return selected_text


val_data = load_data()

# Tokenizers compared in the app. "openai/gpt4" is counted with tiktoken;
# the rest are Hugging Face repos.
tokenizer_names_to_test = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
]
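
# The per-tokenizer columns in dataset.csv are assumed to hold precomputed
# token counts. A minimal sketch of how such counts could be generated offline
# with the Hugging Face `transformers` library (hypothetical helper, not part
# of this app):
#
#     from transformers import AutoTokenizer
#
#     def count_tokens(model_name: str, text: str) -> int:
#         tokenizer = AutoTokenizer.from_pretrained(model_name)
#         return len(tokenizer(text)["input_ids"])
#
# For "openai/gpt4" the equivalent would use tiktoken, e.g.
# len(tiktoken.encoding_for_model("gpt-4").encode(text)).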

with st.sidebar:
    all_tokenizers = st.checkbox("Select All Tokenizers")
    if all_tokenizers:
        selected_tokenizers = tokenizer_names_to_test
    else:
        selected_tokenizers = st.multiselect(
            "Select tokenizers",
            options=tokenizer_names_to_test,
            default=[
                "openai/gpt4",
                "Xenova/gpt-4o",
                "CohereForAI/aya-101",
                "Xenova/claude-tokenizer",
            ],
            label_visibility="collapsed",
        )
    # Link each tokenizer name to its Hugging Face page (tiktoken's GitHub
    # repo for "openai/gpt4", which has no Hub entry).
    links = [
        (
            f"[{tokenizer_name}](https://huggingface.co/{tokenizer_name})"
            if tokenizer_name != "openai/gpt4"
            else f"[{tokenizer_name}](https://github.com/openai/tiktoken)"
        )
        for tokenizer_name in selected_tokenizers
    ]
    link = "Tokenized using " + ", ".join(links)
    st.markdown(link, unsafe_allow_html=True)

# The table and plots below need at least one tokenizer selected.
if not selected_tokenizers:
    st.warning("Select at least one tokenizer in the sidebar.")
    st.stop()

language_options = sorted(val_data["lang"].unique())
selected_language = st.selectbox(
    "Select language",
    options=language_options,
    index=language_options.index("English") if "English" in language_options else 0,
    label_visibility="collapsed",
)

selected_text = reload_example_text_data(selected_language, selected_tokenizers)
st.subheader(f"**Sampled Text:** `{selected_text}`")

st.subheader("Number of Tokens")
st.table(st.session_state.examplesdf)

# Density plot of token counts across the selected tokenizers
# (histogram and rug suppressed; only the density curves are drawn)
hist_data = [val_data[tokenizer].dropna() for tokenizer in selected_tokenizers]
fig = ff.create_distplot(
    hist_data, selected_tokenizers, show_hist=False, show_rug=False
)
fig.update_layout(
    title="Token Distribution Density",
    xaxis_title="Number of Tokens",
    yaxis_title="Density",
    height=500,
)
st.plotly_chart(fig, use_container_width=True)

# Box plot showing the spread of token counts for each tokenizer
tokenizer_to_num_tokens = {
    name: val_data[name].tolist() for name in selected_tokenizers
}
fig = go.Figure()
for tokenizer_name in selected_tokenizers:
    fig.add_trace(
        go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
    )
fig.update_layout(title="Token Count Variability")
st.plotly_chart(fig, use_container_width=True)
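
# Optional (not part of the original app): a resample control. Any widget
# interaction triggers a full Streamlit rerun, so a bare button is enough to
# make reload_example_text_data draw a fresh random row:
#
#     st.button("Sample another text")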