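"""Streamlit app for exploring lemma frequencies in a dataset of passages cited
from Pliny's Naturalis Historia ("plin. nat.").

Each selectable CSV file is expected to provide at least the columns 'Lemma',
'Book/Chapter', and 'Context', which are the fields this script reads.
"""
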
import streamlit as st
import pandas as pd
import plotly.express as px
import nltk

# nltk.word_tokenize relies on the Punkt tokenizer models, so fetch them up front
nltk.download('punkt')
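# Note: newer NLTK releases (3.9+) may also need the 'punkt_tab' resource for
# word_tokenize; if tokenization fails, try nltk.download('punkt_tab') as well.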


def count_tokens(text):
    """Return the number of NLTK word tokens in a text passage."""
    tokens = nltk.word_tokenize(text)
    return len(tokens)


def extract_number(entry):
    """Parse the first book/chapter number after "plin. nat." for use as a sort key."""
    marker = entry.find("plin. nat.")
    if marker == -1:
        return 0.0
    num_str = ''
    # Tolerate an optional space after the prefix, then collect digits and dots
    for char in entry[marker + len("plin. nat."):].lstrip():
        if char.isdigit() or char == '.':
            num_str += char
        else:
            break
    try:
        # Multi-part references such as "3.4.5" are not valid floats; treat them as 0.0
        return float(num_str)
    except ValueError:
        return 0.0
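
# Illustrative only: for an entry such as "plin. nat. 3.139" (assumed citation
# format), extract_number returns 3.139, so entries sort by their place in the
# Naturalis Historia.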


def visualize_data(csv_file, sort_entries=False):
    # Load data from the selected CSV file
    data = pd.read_csv(csv_file)

    # Sort by the first number after "plin. nat." if requested
    if sort_entries:
        data['SortKey'] = data['Book/Chapter'].apply(extract_number)
        data = data.sort_values(by='SortKey')

    data['Token_Count'] = data['Context'].apply(count_tokens)

    # Basic statistics: mention count and mean context length per lemma
    lemma_stats = data.groupby('Lemma').agg({'Context': 'count', 'Token_Count': 'mean'}).reset_index()

    # Display the basic statistics as a table
    st.write("Basic Statistics:")
    st.table(lemma_stats)

    # Bar chart of lemma frequency
    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Context',
        color='Lemma',
        labels={'Context': 'Frequency'},
        title='Lemma Frequency in the Dataset'
    )
    st.plotly_chart(fig_bar)

    # Frequency counts per lemma, for the pie chart and summary below
    lemma_stats_additional = data['Lemma'].value_counts().reset_index()
    lemma_stats_additional.columns = ['Lemma', 'Frequency']

    # value_counts() sorts in descending order, so the first row is the most common lemma
    most_common_lemma_additional = lemma_stats_additional.iloc[0]['Lemma']

    # Mentions of each lemma per book/chapter; keep Book/Chapter as the index so
    # the stacked chart below matches its title and axis labels
    chapter_stats_additional = data.groupby(['Lemma', 'Book/Chapter']).size().unstack('Lemma', fill_value=0)

    # Pie chart of the lemma frequency distribution
    fig_pie = px.pie(
        lemma_stats_additional,
        values='Frequency',
        names='Lemma',
        title='Lemma Frequency Distribution'
    )
    st.plotly_chart(fig_pie)

    # Stacked bar chart of lemma mentions per book/chapter
    fig_additional = px.bar(
        chapter_stats_additional,
        barmode='stack',
        labels={'index': 'Book/Chapter'},
        title='Chapter-wise Lemma Mentions'
    )
    st.plotly_chart(fig_additional)

    # Report the most common lemma
    st.write(f"Most Common Lemma: {most_common_lemma_additional}")

    # Expandable section showing the context of every entry
    with st.expander("Click to view context"):
        for index, row in data.iterrows():
            st.write(f"Lemma: {row['Lemma']}")
            st.write(f"Book/Chapter: {row['Book/Chapter']}")
            st.write(f"Context: {row['Context']}")
            st.write('-' * 50)


def main():
    st.title("Lemma Frequency Visualization")

    # File selection
    csv_file = st.sidebar.selectbox(
        "Select CSV file:",
        ["allData.csv", "places.csv", "ethonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"],
    )

    # Option to sort entries by their "plin. nat." reference
    sort_entries = st.sidebar.checkbox("Sort entries by plin. nat. number")

    # Visualize the selected file with the chosen sort option
    visualize_data(csv_file, sort_entries)


if __name__ == "__main__":
    main()
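
# Typical local usage, assuming Streamlit is installed and the CSV files listed
# in main() sit next to this script:
#     streamlit run app.py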