Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
import nltk | |
# Fetch the tokenizer data that nltk.word_tokenize loads at call time.
# Recent NLTK releases read the "punkt_tab" resource instead of the older
# pickled "punkt" models, so both are requested to work on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
def count_tokens(text):
    """Return the number of NLTK word tokens in ``text``.

    Args:
        text: The passage to tokenize. Any value that is not a ``str`` (for
            example the float NaN that pandas produces for an empty CSV
            cell) is treated as containing no tokens.

    Returns:
        int: Length of the token list from ``nltk.word_tokenize``, or 0 when
        ``text`` is not a string.
    """
    # word_tokenize expects a string; guard so one blank cell does not abort
    # the whole column-wise apply().
    if not isinstance(text, str):
        return 0
    return len(nltk.word_tokenize(text))
def extract_number(entry):
    """Return the first number that follows "plin. nat." in ``entry``.

    The number is read as digits with at most one decimal point, so a
    reference such as "plin. nat. 3.12.4" yields 3.12. Whitespace between
    the marker and the number is skipped.

    Args:
        entry: Citation text, e.g. "plin. nat. 3.12".

    Returns:
        float: The parsed number, or 0.0 when ``entry`` is not a string, the
        marker is absent, or no number follows the marker.
    """
    import re  # local import: only this helper needs it

    if not isinstance(entry, str):
        return 0.0

    # partition() reports a missing marker explicitly; str.find() would
    # return -1 and silently start parsing in the middle of unrelated text.
    _, marker, tail = entry.partition("plin. nat.")
    if not marker:
        return 0.0

    # Accept "3", "3.", "3.12" or ".5"; stop before a second dot so float()
    # can never be handed something like "3.12.4" or a lone ".".
    match = re.match(r"\s*(\d+(?:\.\d*)?|\.\d+)", tail)
    return float(match.group(1)) if match else 0.0
def visualize_data(csv_file, sort_entries=False):
    """Render lemma statistics and charts for one CSV file on the Streamlit page.

    The CSV must provide the columns ``Lemma``, ``Book/Chapter`` and
    ``Context``; these are the columns read below.

    Args:
        csv_file: Path (or file-like object) passed to ``pandas.read_csv``.
        sort_entries: When true, order the rows by the number that
            ``extract_number`` finds in ``Book/Chapter`` before anything is
            computed or displayed.

    Returns:
        None. All output is written to the Streamlit page.
    """
    # Load data from CSV file
    data = pd.read_csv(csv_file)

    # A header-only file would otherwise fail further down, where the most
    # common lemma is read from the first row of an empty frequency table.
    if data.empty:
        st.warning("The selected file contains no rows to visualize.")
        return

    # Sorting by the first number after "plin. nat." if specified
    if sort_entries:
        data['SortKey'] = data['Book/Chapter'].apply(extract_number)
        # Stable sort: rows sharing a key keep their original file order.
        data = data.sort_values(by='SortKey', kind='mergesort')

    data['Token_Count'] = data['Context'].apply(count_tokens)

    # Basic statistics: per lemma, the number of contexts and the mean
    # token count of those contexts.
    lemma_stats = (
        data.groupby('Lemma')
        .agg({'Context': 'count', 'Token_Count': 'mean'})
        .reset_index()
    )

    # Display the basic statistics using st.table()
    st.write("Basic Statistics:")
    st.table(lemma_stats)

    # Bar chart for lemma frequency using Plotly Express
    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Context',
        color='Lemma',
        labels={'Context': 'Frequency'},
        title='Lemma Frequency in the Dataset'
    )
    st.plotly_chart(fig_bar)

    # Additional visualization: lemma frequencies, most frequent first.
    lemma_stats_additional = data['Lemma'].value_counts().reset_index()
    lemma_stats_additional.columns = ['Lemma', 'Frequency']

    # Find the most common lemma (first row of the frequency table)
    most_common_lemma_additional = lemma_stats_additional.iloc[0]['Lemma']

    # Distribution across chapters: one row per lemma, one column per
    # Book/Chapter value, cells holding occurrence counts.
    chapter_stats_additional = (
        data.groupby(['Lemma', 'Book/Chapter']).size().unstack(fill_value=0)
    )

    # Pie chart for lemma frequency using Plotly Express
    fig_pie = px.pie(
        lemma_stats_additional,
        values='Frequency',
        names='Lemma',
        title='Lemma Frequency Distribution'
    )
    st.plotly_chart(fig_pie)

    # Stacked bar chart for chapter-wise lemma mentions.
    # NOTE(review): after unstack() the frame is indexed by Lemma with one
    # column per Book/Chapter, so the 'index' label below may not match any
    # axis - confirm the intended orientation.
    fig_additional = px.bar(
        chapter_stats_additional,
        barmode='stack',
        labels={'index': 'Book/Chapter'},
        title='Chapter-wise Lemma Mentions'
    )
    st.plotly_chart(fig_additional)

    # Display the most common lemma
    st.write(f"Most Common Lemma: {most_common_lemma_additional}")

    # Expandable section listing every row's lemma, reference and context
    with st.expander("Click to view context"):
        for _, row in data.iterrows():
            st.write(f"Lemma: {row['Lemma']}")
            st.write(f"Book/Chapter: {row['Book/Chapter']}")
            st.write(f"Context: {row['Context']}")
            st.write('-' * 50)
def main():
    """Build the page: title, sidebar controls, then the visualizations."""
    st.title("Lemma Frequency Visualization")

    # Data sets offered in the sidebar drop-down.
    available_files = [
        "allData.csv",
        "places.csv",
        "ethonyms.csv",
        "rivers.csv",
        "mountains.csv",
        "toponyms.csv",
    ]

    sidebar = st.sidebar
    chosen_file = sidebar.selectbox("Select CSV file:", available_files)
    sort_requested = sidebar.checkbox("Sort entries by plin. nat. number")

    # Render everything for the chosen file, honouring the sort option.
    visualize_data(chosen_file, sort_requested)
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()