"""Streamlit app visualizing lemma frequencies from Pliny (Naturalis Historia) CSVs."""
import re

import nltk
import pandas as pd
import plotly.express as px
import streamlit as st

# Fetch the tokenizer models once at startup; quiet=True suppresses the
# download progress output that would otherwise print on every Streamlit rerun.
nltk.download('punkt', quiet=True)

# First number (with at most one decimal part) after the "plin. nat." citation
# marker, e.g. "plin. nat. 3.4.12" -> "3.4".  Capturing only one dot avoids
# feeding a multi-dot string such as "3.4.12" to float(), which would raise
# ValueError in the original character-scanning implementation.
_PLIN_NAT_NUMBER = re.compile(r"plin\. nat\.\s*(\d+(?:\.\d+)?)")


def count_tokens(text):
    """Return the number of word tokens in *text* (NLTK word tokenizer)."""
    return len(nltk.word_tokenize(text))


def extract_number(entry):
    """Extract the first number following "plin. nat." in *entry* as a float.

    Returns 0.0 when the marker is absent or not followed by a digit.
    (The original scanned from ``find(...) + len(...)`` even when ``find``
    returned -1, so a missing marker could pick up unrelated digits; and a
    citation like "3.4.12" crashed ``float`` with ValueError.)
    """
    match = _PLIN_NAT_NUMBER.search(entry)
    return float(match.group(1)) if match else 0.0


def visualize_data(csv_file, sort_entries=False):
    """Load *csv_file* and render all statistics and charts for it.

    Parameters
    ----------
    csv_file : str
        Path to a CSV with 'Lemma', 'Book/Chapter' and 'Context' columns.
    sort_entries : bool
        When True, sort rows by the first number after "plin. nat."
        in the 'Book/Chapter' column.
    """
    data = pd.read_csv(csv_file)

    if sort_entries:
        data['SortKey'] = data['Book/Chapter'].apply(extract_number)
        data = data.sort_values(by='SortKey')

    data['Token_Count'] = data['Context'].apply(count_tokens)

    _show_basic_stats(data)
    _show_additional_charts(data)
    _show_context(data)


def _show_basic_stats(data):
    """Render the per-lemma summary table and the frequency bar chart."""
    lemma_stats = (
        data.groupby('Lemma')
        .agg({'Context': 'count', 'Token_Count': 'mean'})
        .reset_index()
    )
    st.write("Basic Statistics:")
    st.table(lemma_stats)

    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Context',
        color='Lemma',
        labels={'Context': 'Frequency'},
        title='Lemma Frequency in the Dataset',
    )
    st.plotly_chart(fig_bar)


def _show_additional_charts(data):
    """Render the pie chart, chapter-wise stacked bars and most common lemma."""
    lemma_counts = data['Lemma'].value_counts().reset_index()
    lemma_counts.columns = ['Lemma', 'Frequency']
    # value_counts() sorts descending, so row 0 is the most frequent lemma.
    most_common_lemma = lemma_counts.iloc[0]['Lemma']

    fig_pie = px.pie(
        lemma_counts,
        values='Frequency',
        names='Lemma',
        title='Lemma Frequency Distribution',
    )
    st.plotly_chart(fig_pie)

    # Lemma x Book/Chapter mention matrix (missing combinations become 0).
    chapter_stats = (
        data.groupby(['Lemma', 'Book/Chapter']).size().unstack(fill_value=0)
    )
    fig_chapters = px.bar(
        chapter_stats,
        barmode='stack',
        labels={'index': 'Book/Chapter'},
        title='Chapter-wise Lemma Mentions',
    )
    st.plotly_chart(fig_chapters)

    st.write(f"Most Common Lemma: {most_common_lemma}")


def _show_context(data):
    """Render every row's lemma/chapter/context inside an expandable section."""
    with st.expander("Click to view context"):
        for _, row in data.iterrows():
            st.write(f"Lemma: {row['Lemma']}")
            st.write(f"Book/Chapter: {row['Book/Chapter']}")
            st.write(f"Context: {row['Context']}")
            st.write('-' * 50)


def main():
    """Streamlit entry point: sidebar controls plus the visualization."""
    st.title("Lemma Frequency Visualization")

    csv_file = st.sidebar.selectbox(
        "Select CSV file:",
        ["allData.csv", "places.csv", "ethonyms.csv",
         "rivers.csv", "mountains.csv", "toponyms.csv"],
    )
    sort_entries = st.sidebar.checkbox("Sort entries by plin. nat. number")

    visualize_data(csv_file, sort_entries)


if __name__ == "__main__":
    main()