Spaces:
Runtime error
Runtime error
Update code to match the updated data with pre-calculated token lengths
Browse files
app.py
CHANGED
@@ -11,6 +11,7 @@ import plotly.figure_factory as ff
|
|
11 |
import plotly.express as px
|
12 |
|
13 |
tokenizer_names_to_test = [
|
|
|
14 |
"xlm-roberta-base", # old style
|
15 |
"bert-base-uncased", # old style
|
16 |
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
@@ -51,11 +52,11 @@ with st.sidebar:
|
|
51 |
# st.success('Completed.')
|
52 |
|
53 |
with st.container():
|
|
|
|
|
|
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
subset_df = val_data[val_data.lang.isin(languages)]
|
58 |
-
subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
|
59 |
fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
|
60 |
st.plotly_chart(fig, use_container_width=True)
|
61 |
|
|
|
11 |
import plotly.express as px
|
12 |
|
13 |
tokenizer_names_to_test = [
|
14 |
+
"openai/gpt4",
|
15 |
"xlm-roberta-base", # old style
|
16 |
"bert-base-uncased", # old style
|
17 |
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
|
|
52 |
# st.success('Completed.')
|
53 |
|
54 |
with st.container():
|
55 |
+
if tokenizer_name in val_data.columns:
|
56 |
+
subset_df = val_data[val_data.lang.isin(languages)]
|
57 |
+
subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
|
58 |
|
59 |
+
|
|
|
|
|
|
|
60 |
fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
|
61 |
st.plotly_chart(fig, use_container_width=True)
|
62 |
|
main.py
DELETED
@@ -1,71 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
from collections import defaultdict
|
3 |
-
import tqdm
|
4 |
-
import transformers
|
5 |
-
from transformers import AutoTokenizer
|
6 |
-
import pandas as pd
|
7 |
-
import matplotlib.pyplot as plt
|
8 |
-
import seaborn as sns
|
9 |
-
import numpy as np
|
10 |
-
import plotly.figure_factory as ff
|
11 |
-
import plotly.express as px
|
12 |
-
|
13 |
-
tokenizer_names_to_test = [
|
14 |
-
"xlm-roberta-base", # old style
|
15 |
-
"bert-base-uncased", # old style
|
16 |
-
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
|
17 |
-
"bigscience/bloom", # HuggingFace
|
18 |
-
"StabilityAI/stablelm-base-alpha-7b", # StableLM with Open Assistant
|
19 |
-
"google/flan-t5-base", # Flan T5 (better than T5), Google
|
20 |
-
"facebook/mbart-large-50", # Facebook
|
21 |
-
"facebook/nllb-200-distilled-600M", # Facebook
|
22 |
-
"EleutherAI/gpt-neox-20b", # same as Pythia
|
23 |
-
]
|
24 |
-
|
25 |
-
with st.sidebar:
|
26 |
-
with st.spinner('Loading dataset...'):
|
27 |
-
val_data = pd.read_csv('MassiveDatasetValidationData.csv')
|
28 |
-
st.success(f'Data loaded: {len(val_data)}')
|
29 |
-
|
30 |
-
languages = st.multiselect(
|
31 |
-
'Select languages',
|
32 |
-
options=sorted(val_data.lang.unique()),
|
33 |
-
default=['English', 'Spanish' ,'Chinese'],
|
34 |
-
max_selections=5
|
35 |
-
)
|
36 |
-
|
37 |
-
# TODO multi-select tokenizers
|
38 |
-
# TODO add openai to this options
|
39 |
-
tokenizer_name = st.sidebar.selectbox('Tokenizers', options=tokenizer_names_to_test)
|
40 |
-
st.write('You selected:', tokenizer_name)
|
41 |
-
|
42 |
-
# with st.spinner('Loading tokenizer...'):
|
43 |
-
# tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
|
44 |
-
# st.success(f'Tokenizer loaded: {tokenizer_name}')
|
45 |
-
|
46 |
-
# # TODO - preload the tokenized versions ... much easier!
|
47 |
-
# # TODO - add the metadata data as well??? later on maybe
|
48 |
-
# with st.spinner('Calculating tokenization for data...'):
|
49 |
-
# if tokenizer_name not in val_data.columns:
|
50 |
-
# val_data[f'{tokenizer_name}'] = val_data.text.apply(lambda x: len(tokenizer.encode(x)))
|
51 |
-
# st.success('Completed.')
|
52 |
-
|
53 |
-
with st.container():
|
54 |
-
|
55 |
-
tokenizer_name = 'num_tokens_openai'
|
56 |
-
|
57 |
-
subset_df = val_data[val_data.lang.isin(languages)]
|
58 |
-
subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
|
59 |
-
fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=False)
|
60 |
-
st.plotly_chart(fig, use_container_width=True)
|
61 |
-
|
62 |
-
|
63 |
-
# for _lang in languages:
|
64 |
-
# subset = val_data[val_data.lang==_lang]
|
65 |
-
|
66 |
-
# fig = ff.create_distplot(val_data, bin_size=.5,
|
67 |
-
# curve_type='normal', # override default 'kde'
|
68 |
-
# colors=colors)
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|