bestroi committed on
Commit
bcd03e9
·
verified ·
1 Parent(s): d262f97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -54
app.py CHANGED
@@ -2,96 +2,160 @@ import streamlit as st
2
  import pandas as pd
3
  import plotly.express as px
4
  import nltk
 
 
5
 
# Make sure the tokenizer data used by nltk.word_tokenize is available.
# A local copy is looked up first so the data is not downloaded again on every
# run of the script; only resources that are missing are fetched.
# NOTE(review): recent NLTK releases appear to load 'punkt_tab' rather than
# 'punkt' - confirm against the installed NLTK version.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource, quiet=True)
8
 
def count_tokens(text):
    """Return the number of NLTK word tokens in *text*.

    Values that are not strings (for example ``None`` or a float NaN standing
    in for an empty cell) count as zero tokens instead of being handed to the
    tokenizer, which only accepts text.
    """
    if not isinstance(text, str):
        return 0
    return len(nltk.word_tokenize(text))
 
 
 
12
 
def extract_number(entry):
    """Return the number that follows "plin. nat." in *entry* as a float.

    The digits and dots found directly after the marker (leading whitespace is
    skipped) are converted with ``float``.

    Returns 0.0 when *entry* is not a string, when the marker is absent, when
    no number follows it, or when the collected characters are not a valid
    float (for example "4.11.40").
    """
    marker = "plin. nat."
    if not isinstance(entry, str):
        return 0.0

    position = entry.find(marker)
    if position == -1:
        # Without this check the -1 from find() would be turned into an
        # arbitrary offset into the string.
        return 0.0

    collected = []
    for char in entry[position + len(marker):].lstrip():
        if not (char.isdigit() or char == '.'):
            break
        collected.append(char)

    try:
        return float(''.join(collected))
    except ValueError:
        # Empty string, a lone ".", several dots, or non-ASCII digits.
        return 0.0
22
-
def visualize_data(csv_file, sort_entries=False):
    """Read a lemma CSV file and render its statistics and charts in Streamlit.

    Rendered in order: a per-lemma table (number of contexts and mean token
    count), a bar chart of lemma frequencies, a pie chart of the frequency
    distribution, a stacked bar chart of mentions per 'Book/Chapter', the most
    common lemma, and an expander listing every row.

    Args:
        csv_file: Path of the CSV file. The columns 'Lemma', 'Book/Chapter'
            and 'Context' are required.
        sort_entries: When true, rows are ordered by the number that
            ``extract_number`` finds in 'Book/Chapter'.

    Returns:
        None. An unreadable file, missing columns or a dataset without lemmas
        are reported on the page and end the function early.
    """
    try:
        data = pd.read_csv(csv_file)
    except (OSError, ValueError) as exc:
        # OSError covers a missing/unreadable file; pandas parsing and
        # decoding failures derive from ValueError.
        st.error(f"Error reading '{csv_file}': {exc}")
        return

    missing = [
        column
        for column in ('Lemma', 'Book/Chapter', 'Context')
        if column not in data.columns
    ]
    if missing:
        st.error(f"The CSV file is missing required columns: {missing}")
        return

    if sort_entries:
        # The helper column only exists for the sort and is dropped again so
        # it does not leak into the displayed data. A stable sort keeps the
        # file order of rows that share a key.
        data = (
            data.assign(SortKey=data['Book/Chapter'].apply(extract_number))
            .sort_values(by='SortKey', kind='stable')
            .drop(columns='SortKey')
        )

    data['token_count'] = data['Context'].apply(count_tokens)

    # value_counts() is sorted by descending frequency, so row 0 is the most
    # common lemma.
    lemma_counts = data['Lemma'].value_counts().reset_index()
    lemma_counts.columns = ['Lemma', 'Frequency']
    if lemma_counts.empty:
        st.warning(f"'{csv_file}' contains no lemmas; nothing to display.")
        return
    most_common_lemma = lemma_counts.iloc[0]['Lemma']

    lemma_stats = (
        data.groupby('Lemma')
        .agg({'Context': 'count', 'token_count': 'mean'})
        .reset_index()
    )

    st.write("Basic Statistics:")
    st.table(lemma_stats)

    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Context',
        color='Lemma',
        labels={'Context': 'Frequency'},
        title='Lemma Frequency in the Dataset'
    )
    st.plotly_chart(fig_bar)

    fig_pie = px.pie(
        lemma_counts,
        values='Frequency',
        names='Lemma',
        title='Lemma Frequency Distribution'
    )
    st.plotly_chart(fig_pie)

    # Rows are 'Book/Chapter' and columns are lemmas, so the x axis of the
    # wide-form bar chart really is the chapter.
    chapter_stats = (
        data.groupby(['Book/Chapter', 'Lemma']).size().unstack(fill_value=0)
    )
    fig_chapter = px.bar(
        chapter_stats,
        barmode='stack',
        labels={'index': 'Book/Chapter'},
        title='Chapter-wise Lemma Mentions'
    )
    st.plotly_chart(fig_chapter)

    st.write(f"Most Common Lemma: {most_common_lemma}")

    with st.expander("Click to view context"):
        for _, row in data.iterrows():
            st.write(f"Lemma: {row['Lemma']}")
            st.write(f"Book/Chapter: {row['Book/Chapter']}")
            st.write(f"Context: {row['Context']}")
            st.write('-' * 50)
77
 
def main():
    """Build the Streamlit page.

    Sets the title, fills the sidebar (logo, dataset description, CSV picker
    and sort option) and renders the visualisations for the selected file via
    ``visualize_data``.
    """
    import os  # local import so the block does not depend on a file-level one

    st.title("Lemma Frequency Visualization")

    # Sidebar section
    logo_path = "imgs/DiGi_Thrace logo-tall.jpg"
    if os.path.exists(logo_path):
        st.sidebar.image(logo_path, use_column_width=True)
    else:
        # Warn instead of letting a missing logo stop the rest of the page.
        st.sidebar.warning(f"Image '{logo_path}' not found.")

    # Built from un-indented lines so no source indentation ends up in the
    # Markdown. The text contains no HTML, so raw HTML rendering is not enabled.
    about_text = "\n".join((
        "### The Dataset:",
        "The dataset is a curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.",
        "",
        "_Measuring Ancient Thrace: Re-evaluating Antiquity in Digital Age_",
        "",
        "**Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**",
    ))
    st.sidebar.markdown(about_text)

    csv_file = st.sidebar.selectbox(
        "Select CSV file:",
        ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"],
    )
    # Expose the sort option of visualize_data, which was otherwise unreachable.
    sort_entries = st.sidebar.checkbox("Sort entries based on 'Book/Chapter'")

    visualize_data(csv_file, sort_entries=sort_entries)


if __name__ == "__main__":
    main()
 
2
  import pandas as pd
3
  import plotly.express as px
4
  import nltk
5
+ from nltk.tokenize import word_tokenize
6
+ import os
7
 
# Ensure the tokenizer data used by word_tokenize is installed.
# Each resource is looked up locally first and downloaded only when missing,
# so an ordinary run of the script does not contact the download server.
# NOTE(review): recent NLTK releases appear to load 'punkt_tab' rather than
# 'punkt' - confirm against the installed NLTK version.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource, quiet=True)
10
 
def count_tokens(text) -> int:
    """Count the word tokens in *text*.

    Returns the length of ``word_tokenize(text)`` for a string and 0 for any
    other value (for example ``None`` or a float NaN).
    """
    if not isinstance(text, str):
        return 0
    return len(word_tokenize(text))
17
 
def extract_number(entry):
    """Extract the floating-point number that follows "plin. nat." in *entry*.

    Whitespace between the marker and the number is skipped; the run of digits
    and dots that follows is converted with ``float``.

    Returns 0.0 if *entry* is not a string, the marker is not found, no number
    follows it, or the conversion fails (for example for "4.11.40").
    """
    search_str = "plin. nat."
    if not isinstance(entry, str):
        return 0.0

    _, found, remainder = entry.partition(search_str)
    if not found:
        return 0.0

    # Without the strip, a space after the marker would end the scan at once
    # and every such entry would come back as 0.0.
    remainder = remainder.lstrip()
    end = 0
    while end < len(remainder) and (remainder[end].isdigit() or remainder[end] == '.'):
        end += 1

    try:
        return float(remainder[:end])
    except ValueError:
        return 0.0
38
+
def visualize_data(csv_file, sort_entries=False):
    """Read a lemma CSV file, compute statistics and render them in Streamlit.

    Rendered in order: a per-lemma table (frequency and average token count of
    the contexts), a bar chart of lemma frequencies, a pie chart of the ten
    most frequent lemmas (any remainder grouped as "Others"), a stacked bar
    chart of mentions per 'Book/Chapter', the most common lemma, and an
    expander listing every row.

    Args:
        csv_file: Path of the CSV file. The columns 'Book/Chapter', 'Context'
            and 'Lemma' are required.
        sort_entries: When true, rows are ordered by the number that
            ``extract_number`` finds in 'Book/Chapter'.

    Returns:
        None. A missing or unreadable file, missing columns, or a dataset
        without lemmas are reported on the page and end the function early.
    """
    if not os.path.exists(csv_file):
        st.error(f"The file '{csv_file}' does not exist. Please check the file path.")
        return

    try:
        data = pd.read_csv(csv_file)
    except Exception as e:
        # UI boundary: whatever went wrong while reading is shown to the user
        # rather than surfacing as a traceback.
        st.error(f"Error reading '{csv_file}': {e}")
        return

    # Check for necessary columns
    required_columns = {'Book/Chapter', 'Context', 'Lemma'}
    if not required_columns.issubset(data.columns):
        st.error(f"The CSV file must contain the following columns: {required_columns}")
        return

    if sort_entries:
        # The helper column exists only for the sort. A stable sort keeps the
        # file order of rows that share a key (e.g. all unparseable entries).
        data = (
            data.assign(SortKey=data['Book/Chapter'].apply(extract_number))
            .sort_values(by='SortKey', kind='stable')
            .drop(columns='SortKey')
        )

    data['token_count'] = data['Context'].apply(count_tokens)

    # Group by 'Lemma' to get frequency and average token count
    lemma_stats = (
        data.groupby('Lemma')
        .agg({'Context': 'count', 'token_count': 'mean'})
        .reset_index()
        .rename(columns={'Context': 'Frequency', 'token_count': 'Average Token Count'})
    )
    if lemma_stats.empty:
        # idxmax() below raises on an empty column, so stop here.
        st.warning(f"'{csv_file}' contains no lemmas; nothing to display.")
        return

    st.subheader("Basic Statistics")
    st.table(lemma_stats)

    # Bar Chart: Lemma Frequency
    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Frequency',
        color='Lemma',
        labels={'Frequency': 'Frequency'},
        title='Lemma Frequency in the Dataset'
    )
    st.plotly_chart(fig_bar)

    # Pie Chart: to avoid clutter, show the top lemmas and aggregate the rest.
    top_n = 10
    pie_data = lemma_stats.nlargest(top_n, 'Frequency')[['Lemma', 'Frequency']]
    others = lemma_stats['Frequency'].sum() - pie_data['Frequency'].sum()
    if others > 0:
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed. The
        # extra slice is only added when there is something to aggregate.
        pie_data = pd.concat(
            [pie_data, pd.DataFrame({'Lemma': ['Others'], 'Frequency': [others]})],
            ignore_index=True,
        )

    fig_pie = px.pie(
        pie_data,
        values='Frequency',
        names='Lemma',
        title=f'Lemma Frequency Distribution (Top {top_n})'
    )
    st.plotly_chart(fig_pie)

    # Chapter-wise Lemma Mentions: rows are chapters, columns are lemmas.
    chapter_pivot = (
        data.groupby(['Book/Chapter', 'Lemma']).size().unstack(fill_value=0)
    )
    fig_chapter = px.bar(
        chapter_pivot,
        barmode='stack',
        labels={'index': 'Book/Chapter', 'value': 'Count'},
        title='Chapter-wise Lemma Mentions'
    )
    st.plotly_chart(fig_chapter)

    # Most Common Lemma
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax(), 'Lemma']
    st.write(f"**Most Common Lemma:** {most_common_lemma}")

    # Expander to show detailed context
    with st.expander("View Detailed Contexts"):
        for _, row in data.iterrows():
            st.markdown(f"**Lemma:** {row['Lemma']}")
            st.markdown(f"**Book/Chapter:** {row['Book/Chapter']}")
            st.markdown(f"**Context:** {row['Context']}")
            st.markdown("---")
126
 
def main():
    """Set up the Streamlit app.

    Configures the page, fills the sidebar (logo, dataset description, CSV
    picker and sort option) and renders the visualisations for the selected
    file via ``visualize_data``.
    """
    st.set_page_config(page_title="Lemma Frequency Visualization", layout="wide")
    st.title("Lemma Frequency Visualization")

    # Built from un-indented lines so no source indentation ends up in the
    # Markdown shown in the sidebar.
    about_text = "\n".join((
        "### The Dataset:",
        "The dataset is a curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.",
        "",
        "_Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age_",
        "",
        "**Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**",
    ))
    csv_files = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]

    # Sidebar configuration
    with st.sidebar:
        # Display image if it exists
        # NOTE(review): confirm this matches the file name on disk
        # (underscore vs. space after "Thrace").
        image_path = "imgs/DiGi_Thrace_logo-tall.jpg"
        if os.path.exists(image_path):
            st.image(image_path, use_column_width=True)
        else:
            st.warning(f"Image '{image_path}' not found.")

        st.markdown(about_text)

        # File selection
        csv_file = st.selectbox("Select CSV file:", csv_files)

        # Option to sort entries
        sort_entries = st.checkbox("Sort entries based on 'Book/Chapter'")

    # Visualize data based on user selection
    visualize_data(csv_file, sort_entries=sort_entries)


if __name__ == "__main__":
    main()