# Streamlit app: Lemma Frequency Visualization for Pliny's *Naturalis Historia* dataset.
# NOTE(review): removed stray "Spaces: Sleeping" residue scraped from the Hugging Face
# Spaces status header — it was not part of the program.
# Standard library
import os

# Third-party
import nltk
import pandas as pd
import plotly.express as px
import streamlit as st
from nltk.tokenize import word_tokenize

# Ensure the NLTK 'punkt' tokenizer model is available before any tokenization.
nltk.download('punkt', quiet=True)
def count_tokens(text):
    """Return the number of NLTK word tokens in *text*.

    Non-string input (NaN cells from pandas, None, numbers, ...) yields 0
    rather than raising, so this is safe to use with Series.apply.
    """
    if not isinstance(text, str):
        return 0
    return len(word_tokenize(text))
def extract_number(entry):
    """
    Extract the floating-point number following the substring "plin. nat."
    in *entry*.

    Whitespace between the marker and the number is skipped, so both
    "plin. nat.4.11" and "plin. nat. 4.11" yield 4.11.  (The previous
    implementation returned 0.0 for the spaced form because the space
    immediately terminated digit collection — a bug for entries written
    with a separating space.)

    Args:
        entry (str): Text possibly containing a "plin. nat. <number>" citation.

    Returns:
        float: The parsed number, or 0.0 if the marker is absent or no
        valid number follows it.
    """
    marker = "plin. nat."
    pos = entry.find(marker)
    if pos == -1:
        return 0.0
    pos += len(marker)
    # Skip any whitespace between the marker and the number itself.
    while pos < len(entry) and entry[pos].isspace():
        pos += 1
    # Collect the maximal run of digits and dots after the marker.
    chars = []
    for ch in entry[pos:]:
        if ch.isdigit() or ch == '.':
            chars.append(ch)
        else:
            break
    num_str = ''.join(chars)
    try:
        return float(num_str) if num_str else 0.0
    except ValueError:
        # e.g. a bare "." or a multi-dot run like "1.2.3" is not a float
        return 0.0
def visualize_data(csv_file, sort_entries=False):
    """Read *csv_file*, compute per-lemma statistics, and render Streamlit charts.

    Args:
        csv_file (str): Path to a CSV file containing at least the columns
            'Book/Chapter', 'Context' and 'Lemma'.
        sort_entries (bool): When True, sort rows by the number following
            "plin. nat." in 'Book/Chapter' (see extract_number).

    Renders (via Streamlit): a statistics table, a frequency bar chart,
    a top-10 pie chart, a stacked chapter-wise bar chart, the most common
    lemma, and an expander with per-row contexts.
    """
    if not os.path.exists(csv_file):
        st.error(f"The file '{csv_file}' does not exist. Please check the file path.")
        return
    try:
        data = pd.read_csv(csv_file)
    except Exception as e:
        st.error(f"Error reading '{csv_file}': {e}")
        return

    # Check for necessary columns before touching them.
    required_columns = {'Book/Chapter', 'Context', 'Lemma'}
    if not required_columns.issubset(data.columns):
        st.error(f"The CSV file must contain the following columns: {required_columns}")
        return

    # Guard: idxmax() below raises on an empty frame.
    if data.empty:
        st.warning(f"'{csv_file}' contains no data rows.")
        return

    if sort_entries:
        # assign/drop avoids mutating a possible view (SettingWithCopyWarning)
        # and leaves no temporary column behind.
        data = (
            data.assign(SortKey=data['Book/Chapter'].apply(extract_number))
                .sort_values(by='SortKey')
                .drop(columns='SortKey')
        )

    data['token_count'] = data['Context'].apply(count_tokens)

    # Group by 'Lemma' to get frequency and average token count.
    lemma_stats = data.groupby('Lemma').agg({
        'Context': 'count',
        'token_count': 'mean'
    }).reset_index()
    lemma_stats.rename(columns={'Context': 'Frequency', 'token_count': 'Average Token Count'}, inplace=True)

    st.subheader("Basic Statistics")
    st.table(lemma_stats)

    # Bar Chart: Lemma Frequency
    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Frequency',
        color='Lemma',
        labels={'Frequency': 'Frequency'},
        title='Lemma Frequency in the Dataset'
    )
    st.plotly_chart(fig_bar)

    # Pie Chart: Lemma Frequency Distribution.
    # To avoid clutter, show the top 10 lemmas and aggregate the rest.
    top_n = 10
    top_lemmas = lemma_stats.nlargest(top_n, 'Frequency')
    others = lemma_stats['Frequency'].sum() - top_lemmas['Frequency'].sum()
    if others > 0:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported replacement.  Only add an 'Others' slice when non-empty.
        pie_data = pd.concat(
            [top_lemmas, pd.DataFrame({'Lemma': ['Others'], 'Frequency': [others]})],
            ignore_index=True,
        )
    else:
        pie_data = top_lemmas
    fig_pie = px.pie(
        pie_data,
        values='Frequency',
        names='Lemma',
        title='Lemma Frequency Distribution (Top 10)'
    )
    st.plotly_chart(fig_pie)

    # Chapter-wise Lemma Mentions (stacked bars, one color per lemma).
    chapter_stats = data.groupby(['Lemma', 'Book/Chapter']).size().reset_index(name='Count')
    chapter_pivot = chapter_stats.pivot(index='Book/Chapter', columns='Lemma', values='Count').fillna(0)
    fig_chapter = px.bar(
        chapter_pivot,
        barmode='stack',
        labels={'index': 'Book/Chapter', 'value': 'Count'},
        title='Chapter-wise Lemma Mentions'
    )
    st.plotly_chart(fig_chapter)

    # Most Common Lemma
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax()]['Lemma']
    st.write(f"**Most Common Lemma:** {most_common_lemma}")

    # Expander to show detailed context for every row.
    with st.expander("View Detailed Contexts"):
        for index, row in data.iterrows():
            st.markdown(f"**Lemma:** {row['Lemma']}")
            st.markdown(f"**Book/Chapter:** {row['Book/Chapter']}")
            st.markdown(f"**Context:** {row['Context']}")
            st.markdown("---")
def main():
    """Entry point: configure the page, build the sidebar, and run the visualization."""
    st.set_page_config(page_title="Lemma Frequency Visualization", layout="wide")
    st.title("Lemma Frequency Visualization")

    # Sidebar: project logo (if present) and dataset description.
    with st.sidebar:
        image_path = "imgs/DiGi_Thrace_logo-tall.jpg"
        if os.path.exists(image_path):
            # NOTE(review): use_column_width is deprecated in recent Streamlit
            # releases in favor of use_container_width — kept as-is for
            # compatibility with the pinned version; confirm before upgrading.
            st.image(image_path, use_column_width=True)
        else:
            st.warning(f"Image '{image_path}' not found.")
        st.markdown("""
    ### The Dataset:
    The dataset is a curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.
    _Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age_
    **Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**
    """)

    # Dataset picker and sort toggle.
    csv_files = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]
    csv_file = st.selectbox("Select CSV file:", csv_files)
    sort_entries = st.checkbox("Sort entries based on 'Book/Chapter'")

    # Render everything for the chosen dataset.
    visualize_data(csv_file, sort_entries=sort_entries)


if __name__ == "__main__":
    main()