# Spaces: Sleeping
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
import nltk | |
# word_tokenize relies on the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the 'punkt' package, while newer releases look for
# 'punkt_tab' instead, so both are requested. nltk.download reports a package
# it cannot fetch by returning False rather than raising, so asking for a
# package that a given NLTK version does not know about is harmless.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)
def count_tokens(text):
    """Return the number of NLTK word tokens in *text*.

    Missing values (``None`` or a float NaN, which is what pandas produces for
    an empty CSV cell) and blank strings count as zero tokens instead of
    raising inside the tokenizer. Any other non-string value is converted with
    ``str()`` before tokenizing.

    Args:
        text: The text to tokenize; normally a string taken from a CSV cell.

    Returns:
        The token count as an ``int``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return 0
    if not isinstance(text, str):
        text = str(text)
    # Blank input has no tokens; skip the tokenizer call entirely.
    if not text.strip():
        return 0
    return len(nltk.word_tokenize(text))
def extract_number(entry):
    """Parse the number that follows the "plin. nat." marker in *entry*.

    The digits (with at most one decimal point) that come right after the
    marker, ignoring any whitespace in between, are converted to a float,
    e.g. ``"plin. nat. 3.12"`` -> ``3.12``.

    Args:
        entry: A reference string, normally a 'Book/Chapter' cell.

    Returns:
        The parsed number as a ``float``, or ``0.0`` when *entry* is not a
        string, the marker is absent, or no digits follow the marker.

    NOTE(review): the result is a plain float, so "3.5" sorts after "3.12"
    even if those are meant as chapter 5 and chapter 12 — confirm whether a
    book/chapter-aware sort key is wanted by the caller.
    """
    marker = "plin. nat."
    if not isinstance(entry, str):
        # Missing CSV cells arrive as NaN (a float), not as a string.
        return 0.0

    _, found, tail = entry.partition(marker)
    if not found:
        # Without the marker there is no meaningful position to parse from.
        return 0.0

    collected = []
    seen_dot = False
    # The marker is usually followed by a space before the number.
    for char in tail.lstrip():
        if char.isdecimal():
            collected.append(char)
        elif char == '.' and not seen_dot:
            # Accept a single decimal point; a second dot (e.g. a trailing
            # full stop or a third reference level) ends the number.
            seen_dot = True
            collected.append(char)
        else:
            break

    num_str = ''.join(collected)
    # A lone "." (or nothing at all) is not a number.
    if not any(char.isdecimal() for char in num_str):
        return 0.0
    return float(num_str)
def visualize_data(csv_file, sort_entries=False):
    """Load a lemma CSV and render statistics, charts and contexts in Streamlit.

    The function reads the 'Lemma', 'Book/Chapter' and 'Context' columns of
    the CSV. It renders, in order: a per-lemma statistics table, a frequency
    bar chart, a frequency pie chart, a stacked bar chart of lemma/chapter
    counts, the most common lemma, and an expander listing every row.

    Args:
        csv_file: Path (or file-like object) passed to ``pandas.read_csv``.
        sort_entries: When true, rows are ordered by the number parsed from
            'Book/Chapter' with ``extract_number`` and a 'SortKey' column is
            added. The charts are built from grouped counts, so this ordering
            is mainly visible in the context listing.

    Returns:
        None. All output goes to the Streamlit page.
    """
    data = pd.read_csv(csv_file)

    # With no usable lemma values the "most common lemma" lookup below would
    # raise IndexError; tell the user instead of crashing the page.
    if data['Lemma'].dropna().empty:
        st.warning("No lemma entries found in the selected file.")
        return

    if sort_entries:
        data['SortKey'] = data['Book/Chapter'].apply(extract_number)
        data = data.sort_values(by='SortKey')

    # Per-lemma statistics: number of contexts and mean context length.
    data['token_count'] = data['Context'].apply(count_tokens)
    lemma_stats = (
        data.groupby('Lemma')
        .agg({'Context': 'count', 'token_count': 'mean'})
        .reset_index()
    )
    st.write("Basic Statistics:")
    st.table(lemma_stats)

    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Context',
        color='Lemma',
        labels={'Context': 'Frequency'},
        title='Lemma Frequency in the Dataset',
    )
    st.plotly_chart(fig_bar)

    # Frequencies ordered from most to least common. The column names are
    # assigned explicitly so they do not depend on how value_counts() names
    # its output.
    lemma_stats_additional = data['Lemma'].value_counts().reset_index()
    lemma_stats_additional.columns = ['Lemma', 'Frequency']
    most_common_lemma_additional = lemma_stats_additional.iloc[0]['Lemma']

    # One row per lemma, one column per 'Book/Chapter' value.
    chapter_stats_additional = (
        data.groupby(['Lemma', 'Book/Chapter']).size().unstack(fill_value=0)
    )

    fig_pie = px.pie(
        lemma_stats_additional,
        values='Frequency',
        names='Lemma',
        title='Lemma Frequency Distribution',
    )
    st.plotly_chart(fig_pie)

    # NOTE(review): the frame above is indexed by 'Lemma', so the 'index'
    # label key may not match any axis here — confirm the intended
    # orientation of this chart.
    fig_additional = px.bar(
        chapter_stats_additional,
        barmode='stack',
        labels={'index': 'Book/Chapter'},
        title='Chapter-wise Lemma Mentions',
    )
    st.plotly_chart(fig_additional)

    st.write(f"Most Common Lemma: {most_common_lemma_additional}")

    with st.expander("Click to view context"):
        for _, row in data.iterrows():
            st.write(f"Lemma: {row['Lemma']}")
            st.write(f"Book/Chapter: {row['Book/Chapter']}")
            st.write(f"Context: {row['Context']}")
            st.write('-' * 50)
def main():
    """Show the page title, offer the CSV choices in the sidebar, and visualize the pick."""
    st.title("Lemma Frequency Visualization")

    # Data files offered in the sidebar selector.
    available_files = [
        "allData.csv",
        "places.csv",
        "ethnonyms.csv",
        "rivers.csv",
        "mountains.csv",
        "toponyms.csv",
    ]
    selected_file = st.sidebar.selectbox("Select CSV file:", available_files)
    visualize_data(selected_file)


# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()