# Streamlit app: Lemma Frequency Visualization for Pliny's *Naturalis Historia* dataset.
# NOTE(review): removed stray "Spaces: Sleeping" residue scraped from the Hugging Face
# Spaces status header — it was not part of the program.
# Standard library
import os

# Third-party
import nltk
import pandas as pd
import plotly.express as px
import streamlit as st
from nltk.tokenize import word_tokenize

# Ensure the NLTK 'punkt' tokenizer model is available before any tokenization.
nltk.download('punkt', quiet=True)
def count_tokens(text):
    """Return the number of NLTK word tokens in *text*.

    Non-string input (NaN cells from pandas, None, numbers, ...) yields 0
    rather than raising, so this is safe to use with Series.apply.
    """
    if not isinstance(text, str):
        return 0
    return len(word_tokenize(text))
def extract_number(entry):
    """
    Extract the floating-point number following the substring "plin. nat."
    in *entry*.

    Whitespace between the marker and the number is skipped, so both
    "plin. nat.4.11" and "plin. nat. 4.11" yield 4.11.  (The previous
    implementation returned 0.0 for the spaced form because the space
    immediately terminated digit collection — a bug for entries written
    with a separating space.)

    Args:
        entry (str): Text possibly containing a "plin. nat. <number>" citation.

    Returns:
        float: The parsed number, or 0.0 if the marker is absent or no
        valid number follows it.
    """
    marker = "plin. nat."
    pos = entry.find(marker)
    if pos == -1:
        return 0.0
    pos += len(marker)
    # Skip any whitespace between the marker and the number itself.
    while pos < len(entry) and entry[pos].isspace():
        pos += 1
    # Collect the maximal run of digits and dots after the marker.
    chars = []
    for ch in entry[pos:]:
        if ch.isdigit() or ch == '.':
            chars.append(ch)
        else:
            break
    num_str = ''.join(chars)
    try:
        return float(num_str) if num_str else 0.0
    except ValueError:
        # e.g. a bare "." or a multi-dot run like "1.2.3" is not a float
        return 0.0
def visualize_data(csv_file, sort_entries=False):
    """Read *csv_file*, compute per-lemma statistics, and render Streamlit charts.

    Args:
        csv_file (str): Path to a CSV file containing at least the columns
            'Book/Chapter', 'Context' and 'Lemma'.
        sort_entries (bool): When True, sort rows by the number following
            "plin. nat." in 'Book/Chapter' (see extract_number).

    Renders (via Streamlit): a statistics table, a frequency bar chart,
    a top-10 pie chart, a stacked chapter-wise bar chart, the most common
    lemma, and an expander with per-row contexts.
    """
    if not os.path.exists(csv_file):
        st.error(f"The file '{csv_file}' does not exist. Please check the file path.")
        return
    try:
        data = pd.read_csv(csv_file)
    except Exception as e:
        st.error(f"Error reading '{csv_file}': {e}")
        return

    # Check for necessary columns before touching them.
    required_columns = {'Book/Chapter', 'Context', 'Lemma'}
    if not required_columns.issubset(data.columns):
        st.error(f"The CSV file must contain the following columns: {required_columns}")
        return

    # Guard: idxmax() below raises on an empty frame.
    if data.empty:
        st.warning(f"'{csv_file}' contains no data rows.")
        return

    if sort_entries:
        # assign/drop avoids mutating a possible view (SettingWithCopyWarning)
        # and leaves no temporary column behind.
        data = (
            data.assign(SortKey=data['Book/Chapter'].apply(extract_number))
                .sort_values(by='SortKey')
                .drop(columns='SortKey')
        )

    data['token_count'] = data['Context'].apply(count_tokens)

    # Group by 'Lemma' to get frequency and average token count.
    lemma_stats = data.groupby('Lemma').agg({
        'Context': 'count',
        'token_count': 'mean'
    }).reset_index()
    lemma_stats.rename(columns={'Context': 'Frequency', 'token_count': 'Average Token Count'}, inplace=True)

    st.subheader("Basic Statistics")
    st.table(lemma_stats)

    # Bar Chart: Lemma Frequency
    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Frequency',
        color='Lemma',
        labels={'Frequency': 'Frequency'},
        title='Lemma Frequency in the Dataset'
    )
    st.plotly_chart(fig_bar)

    # Pie Chart: Lemma Frequency Distribution.
    # To avoid clutter, show the top 10 lemmas and aggregate the rest.
    top_n = 10
    top_lemmas = lemma_stats.nlargest(top_n, 'Frequency')
    others = lemma_stats['Frequency'].sum() - top_lemmas['Frequency'].sum()
    if others > 0:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported replacement.  Only add an 'Others' slice when non-empty.
        pie_data = pd.concat(
            [top_lemmas, pd.DataFrame({'Lemma': ['Others'], 'Frequency': [others]})],
            ignore_index=True,
        )
    else:
        pie_data = top_lemmas
    fig_pie = px.pie(
        pie_data,
        values='Frequency',
        names='Lemma',
        title='Lemma Frequency Distribution (Top 10)'
    )
    st.plotly_chart(fig_pie)

    # Chapter-wise Lemma Mentions (stacked bars, one color per lemma).
    chapter_stats = data.groupby(['Lemma', 'Book/Chapter']).size().reset_index(name='Count')
    chapter_pivot = chapter_stats.pivot(index='Book/Chapter', columns='Lemma', values='Count').fillna(0)
    fig_chapter = px.bar(
        chapter_pivot,
        barmode='stack',
        labels={'index': 'Book/Chapter', 'value': 'Count'},
        title='Chapter-wise Lemma Mentions'
    )
    st.plotly_chart(fig_chapter)

    # Most Common Lemma
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax()]['Lemma']
    st.write(f"**Most Common Lemma:** {most_common_lemma}")

    # Expander to show detailed context for every row.
    with st.expander("View Detailed Contexts"):
        for index, row in data.iterrows():
            st.markdown(f"**Lemma:** {row['Lemma']}")
            st.markdown(f"**Book/Chapter:** {row['Book/Chapter']}")
            st.markdown(f"**Context:** {row['Context']}")
            st.markdown("---")
def main():
    """Entry point: configure the page, build the sidebar, and run the visualization."""
    st.set_page_config(page_title="Lemma Frequency Visualization", layout="wide")
    st.title("Lemma Frequency Visualization")

    # Sidebar: project logo (if present) and dataset description.
    with st.sidebar:
        image_path = "imgs/DiGi_Thrace_logo-tall.jpg"
        if os.path.exists(image_path):
            # NOTE(review): use_column_width is deprecated in recent Streamlit
            # releases in favor of use_container_width — kept as-is for
            # compatibility with the pinned version; confirm before upgrading.
            st.image(image_path, use_column_width=True)
        else:
            st.warning(f"Image '{image_path}' not found.")
        st.markdown("""
    ### The Dataset:
    The dataset is a curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.
    _Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age_
    **Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**
    """)

    # Dataset picker and sort toggle.
    csv_files = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]
    csv_file = st.selectbox("Select CSV file:", csv_files)
    sort_entries = st.checkbox("Sort entries based on 'Book/Chapter'")

    # Render everything for the chosen dataset.
    visualize_data(csv_file, sort_entries=sort_entries)


if __name__ == "__main__":
    main()