# PliniusNatHist / app.py
# Streamlit app visualizing lemma frequencies from Pliny the Elder's
# *Naturalis Historia* datasets (CSV files with Book/Chapter, Context, Lemma).
import streamlit as st
import pandas as pd
import plotly.express as px
import nltk
from nltk.tokenize import word_tokenize
import os
# Ensure the NLTK 'punkt' tokenizer model is available for word_tokenize;
# quiet=True suppresses the download progress output on app startup.
nltk.download('punkt', quiet=True)
def count_tokens(text):
    """Return the number of NLTK word tokens in *text*; 0 for non-strings.

    Non-string values (e.g. NaN cells from pandas) are treated as empty.
    """
    if not isinstance(text, str):
        return 0
    return len(word_tokenize(text))
def extract_number(entry):
    """Extract the float that follows "plin. nat." in *entry*.

    Used as a numeric sort key for 'Book/Chapter' citations such as
    "plin. nat. 4.35".

    Returns
    -------
    float
        The parsed number, or 0.0 when *entry* is not a string (e.g. a
        NaN cell from pandas), when the marker is absent, or when the
        collected characters do not form a valid float (e.g. "4.35.2").
    """
    # Guard against NaN / non-string cells: this is applied to a CSV
    # column via DataFrame.apply, so non-strings are expected.
    if not isinstance(entry, str):
        return 0.0

    search_str = "plin. nat."
    start_index = entry.find(search_str)
    if start_index == -1:
        return 0.0
    start_index += len(search_str)

    # Citations are normally written with a space after the marker
    # ("plin. nat. 4.35"); skip it so the digit scan does not stop
    # immediately and return 0.0 for every entry.
    remainder = entry[start_index:].lstrip()

    num_str = ''
    for char in remainder:
        if char.isdigit() or char == '.':
            num_str += char
        else:
            break
    try:
        return float(num_str) if num_str else 0.0
    except ValueError:
        # e.g. "4.35.2" collects two dots and cannot be parsed.
        return 0.0
def visualize_data(csv_file, sort_entries=False):
    """Read *csv_file*, compute per-lemma statistics, and render them in Streamlit.

    Parameters
    ----------
    csv_file : str
        Path to a CSV containing at least the columns 'Book/Chapter',
        'Context', and 'Lemma'.
    sort_entries : bool
        When True, order rows by the numeric citation embedded in
        'Book/Chapter' (see extract_number).

    On a missing/unreadable file or missing columns, shows an st.error
    and returns early without rendering.
    """
    if not os.path.exists(csv_file):
        st.error(f"The file '{csv_file}' does not exist. Please check the file path.")
        return
    try:
        data = pd.read_csv(csv_file)
    except Exception as e:
        st.error(f"Error reading '{csv_file}': {e}")
        return

    # Validate the schema before touching any columns.
    required_columns = {'Book/Chapter', 'Context', 'Lemma'}
    if not required_columns.issubset(data.columns):
        st.error(f"The CSV file must contain the following columns: {required_columns}")
        return

    if sort_entries:
        # Sort on the numeric "plin. nat. <n>" reference, then drop the key.
        data['SortKey'] = data['Book/Chapter'].apply(extract_number)
        data = data.sort_values(by='SortKey')
        data.drop('SortKey', axis=1, inplace=True)

    data['token_count'] = data['Context'].apply(count_tokens)

    # Per-lemma frequency and mean context length.
    lemma_stats = data.groupby('Lemma').agg({
        'Context': 'count',
        'token_count': 'mean'
    }).reset_index()
    lemma_stats.rename(columns={'Context': 'Frequency', 'token_count': 'Average Token Count'}, inplace=True)

    st.subheader("Basic Statistics")
    st.table(lemma_stats)

    # Bar Chart: Lemma Frequency
    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Frequency',
        color='Lemma',
        labels={'Frequency': 'Frequency'},
        title='Lemma Frequency in the Dataset'
    )
    st.plotly_chart(fig_bar)

    # Pie Chart: top 10 lemmas plus an aggregated "Others" slice to avoid clutter.
    top_n = 10
    top_lemmas = lemma_stats.nlargest(top_n, 'Frequency')
    others = lemma_stats['Frequency'].sum() - top_lemmas['Frequency'].sum()
    # DataFrame.append was removed in pandas 2.0; build the row via pd.concat.
    pie_data = pd.concat(
        [top_lemmas, pd.DataFrame({'Lemma': ['Others'], 'Frequency': [others]})],
        ignore_index=True
    )
    fig_pie = px.pie(
        pie_data,
        values='Frequency',
        names='Lemma',
        title='Lemma Frequency Distribution (Top 10)'
    )
    st.plotly_chart(fig_pie)

    # Stacked bar: lemma mention counts per Book/Chapter.
    chapter_stats = data.groupby(['Lemma', 'Book/Chapter']).size().reset_index(name='Count')
    chapter_pivot = chapter_stats.pivot(index='Book/Chapter', columns='Lemma', values='Count').fillna(0)
    fig_chapter = px.bar(
        chapter_pivot,
        barmode='stack',
        labels={'index': 'Book/Chapter', 'value': 'Count'},
        title='Chapter-wise Lemma Mentions'
    )
    st.plotly_chart(fig_chapter)

    # Most Common Lemma
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax()]['Lemma']
    st.write(f"**Most Common Lemma:** {most_common_lemma}")

    # Expander with the full context of every row.
    with st.expander("View Detailed Contexts"):
        for _, row in data.iterrows():
            st.markdown(f"**Lemma:** {row['Lemma']}")
            st.markdown(f"**Book/Chapter:** {row['Book/Chapter']}")
            st.markdown(f"**Context:** {row['Context']}")
            st.markdown("---")
def main():
    """Entry point: configure the page, build the sidebar, and render the charts."""
    st.set_page_config(page_title="Lemma Frequency Visualization", layout="wide")
    st.title("Lemma Frequency Visualization")

    # Sidebar: project logo (if present) and dataset description.
    with st.sidebar:
        logo_path = "imgs/DiGi_Thrace_logo-tall.jpg"
        if os.path.exists(logo_path):
            st.image(logo_path, use_column_width=True)
        else:
            st.warning(f"Image '{logo_path}' not found.")
        st.markdown("""
### The Dataset:
The dataset is a curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.
_Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age_
**Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**
""")

    # Dataset picker and sort toggle.
    dataset_options = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]
    selected_csv = st.selectbox("Select CSV file:", dataset_options)
    sort_by_chapter = st.checkbox("Sort entries based on 'Book/Chapter'")

    # Render everything for the chosen file.
    visualize_data(selected_csv, sort_entries=sort_by_chapter)
# Run the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()