bestroi committed on
Commit
a4f6a6d
·
verified ·
1 Parent(s): eef5512

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +447 -0
app.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from lxml import etree
3
+ from pathlib import Path
4
+ from io import BytesIO
5
+ from collections import defaultdict
6
+
7
# Prefix map handed to every XPath query in this module: binds the ``tei``
# prefix to the TEI namespace URI.
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
8
+
9
def parse_xml(file_path):
    """Parse an XML file and return its lxml tree, or ``None`` on failure.

    Args:
        file_path: Location of the XML file, as a ``str`` or ``pathlib.Path``.

    Returns:
        The parsed ``lxml.etree`` tree, or ``None`` when the file cannot be
        read or parsed. Failures are reported in the Streamlit page through
        ``st.error`` instead of being raised.
    """
    # Normalise up front so the error message can use ``.name`` even when the
    # caller passed a plain string rather than a Path.
    path = Path(file_path)
    parser = etree.XMLParser(remove_blank_text=True)
    try:
        return etree.parse(str(path), parser)
    except (etree.LxmlError, OSError) as e:
        # LxmlError covers malformed XML; OSError covers unreadable files.
        st.error(f"Error parsing XML file `{path.name}`: {e}")
        return None
18
+
19
def get_all_authors(parsed_trees):
    """Collect the unique bibliography author names across all trees.

    Args:
        parsed_trees: Iterable of parsed trees exposing ``xpath``.

    Returns:
        Sorted list of distinct, whitespace-trimmed names taken from the
        direct text of ``bibl/author/persName`` elements. Elements whose text
        is missing or consists only of whitespace are skipped, so no empty
        entry ends up in the result.
    """
    authors = set()
    for tree in parsed_trees:
        for pers_name in tree.xpath('//tei:bibl/tei:author/tei:persName', namespaces=NS):
            name = (pers_name.text or '').strip()
            if name:
                authors.add(name)
    return sorted(authors)
29
+
30
def get_all_keywords(parsed_trees):
    """Collect the unique keywords across all trees.

    Each ``keywords/list/item`` element may hold several comma-separated
    keywords in its direct text; every one is trimmed and collected.

    Args:
        parsed_trees: Iterable of parsed trees exposing ``xpath``.

    Returns:
        Sorted list of distinct keywords. Empty fragments (produced by
        trailing or doubled commas) are dropped.
    """
    keywords = set()
    for tree in parsed_trees:
        for item in tree.xpath('//tei:keywords/tei:list/tei:item', namespaces=NS):
            if not item.text:
                continue
            fragments = (piece.strip() for piece in item.text.split(','))
            # Skip the empty strings that "a, b," or "a,,b" would otherwise yield.
            keywords.update(fragment for fragment in fragments if fragment)
    return sorted(keywords)
40
+
41
def get_all_place_names(parsed_trees):
    """Collect the unique place names across all trees.

    Names are read from the direct text of four kinds of element: provenance
    ``placeName``, ``location/name[@type="place"]``, contemporary names inside
    commentary divisions, and ``name[@type="current"]`` anywhere in the
    document.

    Args:
        parsed_trees: Iterable of parsed trees exposing ``xpath``.

    Returns:
        Sorted list of distinct, whitespace-trimmed place names. Blank values
        and the placeholder ``none`` (any letter case) are excluded.
    """
    place_xpaths = (
        '//tei:provenance/tei:placeName',
        '//tei:location/tei:name[@type="place"]',
        '//tei:div[@type="commentary"]//tei:name[@type="contemporary"]',
        '//tei:name[@type="current"]',
    )

    places = set()
    for tree in parsed_trees:
        for expression in place_xpaths:
            for element in tree.xpath(expression, namespaces=NS):
                name = (element.text or '').strip()
                # Compare after stripping so a padded " None " placeholder is
                # filtered out as well.
                if name and name.lower() != 'none':
                    places.add(name)
    return sorted(places)
65
+
66
def build_author_mappings(parsed_trees, xml_files):
    """Map each bibliography author to the places and keywords of their files.

    For every tree, all authors found in it are associated with all place
    names and all keywords found in that same tree.

    Args:
        parsed_trees: Iterable of parsed trees exposing ``xpath``.
        xml_files: Not used by this function; kept so existing callers that
            pass the file list keep working.

    Returns:
        tuple: ``(author_to_places, author_to_keywords)``, two
        ``defaultdict(set)`` objects keyed by author name. Blank values, the
        placeholder ``none`` (for places) and empty keyword fragments are
        excluded.
    """
    place_xpaths = (
        '//tei:provenance/tei:placeName',
        '//tei:location/tei:name[@type="place"]',
        '//tei:div[@type="commentary"]//tei:name[@type="contemporary"]',
        '//tei:name[@type="current"]',
    )

    def stripped_texts(tree, expression):
        """Return the non-blank, trimmed direct text of every matching element."""
        candidates = (
            (element.text or '').strip()
            for element in tree.xpath(expression, namespaces=NS)
        )
        return {text for text in candidates if text}

    author_to_places = defaultdict(set)
    author_to_keywords = defaultdict(set)

    for tree in parsed_trees:
        authors = stripped_texts(tree, '//tei:bibl/tei:author/tei:persName')

        places = set()
        for expression in place_xpaths:
            places.update(
                name
                for name in stripped_texts(tree, expression)
                if name.lower() != 'none'
            )

        keywords = set()
        for item_text in stripped_texts(tree, '//tei:keywords/tei:list/tei:item'):
            fragments = (piece.strip() for piece in item_text.split(','))
            # Drop empty fragments left by trailing or doubled commas.
            keywords.update(fragment for fragment in fragments if fragment)

        for author in authors:
            author_to_places[author].update(places)
            author_to_keywords[author].update(keywords)

    return author_to_places, author_to_keywords
119
+
120
def get_commentary(tree):
    """Return every commentary division of *tree* as serialized XML.

    Returns:
        list: One dict per ``div[@type="commentary"]``, in document order,
        with the keys ``'subtype'`` (the div's ``subtype`` attribute, or
        ``'general'`` when absent) and ``'content'`` (the pretty-printed XML
        of the div as a string).
    """
    return [
        {
            'subtype': section.get('subtype', 'general'),
            'content': etree.tostring(section, pretty_print=True, encoding='unicode'),
        }
        for section in tree.xpath('//tei:div[@type="commentary"]', namespaces=NS)
    ]
129
+
130
def get_editions(tree):
    """Return every edition division of *tree* as serialized XML.

    Returns:
        list: One dict per ``div[@type="edition"]``, in document order, with
        the keys ``'lang'`` (the div's ``xml:lang`` attribute, or
        ``'unknown'`` when absent) and ``'content'`` (the pretty-printed XML
        of the div as a string).
    """
    # ``xml:lang`` in the Clark notation that lxml uses for attribute lookup.
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
    return [
        {
            'lang': section.get(xml_lang, 'unknown'),
            'content': etree.tostring(section, pretty_print=True, encoding='unicode'),
        }
        for section in tree.xpath('//tei:div[@type="edition"]', namespaces=NS)
    ]
140
+
141
def search_by_author(tree, author_query):
    """Find bibliography authors whose name contains *author_query*.

    Only ``bibl/author/persName`` elements are examined. The comparison is a
    case-insensitive substring match on each element's direct text.

    Returns:
        list: One ``"Bibliography Author: <name>"`` string per matching
        element; empty when nothing matches.
    """
    return [
        f"Bibliography Author: {pers_name.text}"
        for pers_name in tree.xpath('//tei:bibl/tei:author/tei:persName', namespaces=NS)
        if pers_name.text and author_query.lower() in pers_name.text.lower()
    ]
150
+
151
def search_by_place(tree, place_query):
    """Find where a place name occurs in a single TEI tree.

    The search covers the same elements that ``get_all_place_names`` collects
    its names from (provenance ``placeName``, ``location/name[@type="place"]``,
    contemporary names inside any commentary division and
    ``name[@type="current"]``), so every name offered for selection can
    actually be found, plus ``geo`` elements below ``location``.

    Matching is a case-insensitive substring test against each element's
    direct text. Blank values and the placeholder ``none`` never match.

    Parameters:
        tree (etree.Element): Parsed XML tree.
        place_query (str): The place name to search for.

    Returns:
        list: One ``"<source label>: <text>"`` string per matching element;
        empty when the place does not occur.
    """
    sources = (
        ("Provenance Place", '//tei:provenance/tei:placeName'),
        ("Location Name", '//tei:location/tei:name[@type="place"]'),
        ("Contemporary Name", '//tei:div[@type="commentary"]//tei:name[@type="contemporary"]'),
        ("Current Name", '//tei:name[@type="current"]'),
        ("Location Geo", '//tei:location//tei:geo'),
    )

    place_query_lower = place_query.lower()
    results = []
    for label, expression in sources:
        for element in tree.xpath(expression, namespaces=NS):
            raw_text = element.text
            if not raw_text:
                continue
            shown = raw_text.strip()
            # Judge the placeholder on the trimmed text so " None " is
            # rejected as well.
            if not shown or shown.lower() == 'none':
                continue
            if place_query_lower in raw_text.lower():
                results.append(f"{label}: {shown}")
    return results
184
+
185
def search_by_keyword(tree, keyword):
    """Find occurrences of *keyword* in keyword items and commentary segments.

    Both lookups are case-insensitive substring matches on each element's
    direct text. Keyword-list hits come first, followed by hits in ``seg``
    elements inside commentary divisions.

    Returns:
        list: ``"Keyword: <text>"`` and ``"Commentary Segment: <text>"``
        strings; empty when nothing matches.
    """
    hits = [
        f"Keyword: {item.text}"
        for item in tree.xpath('//tei:keywords/tei:list/tei:item', namespaces=NS)
        if item.text and keyword.lower() in item.text.lower()
    ]
    hits.extend(
        f"Commentary Segment: {segment.text}"
        for segment in tree.xpath('//tei:div[@type="commentary"]//tei:seg', namespaces=NS)
        if segment.text and keyword.lower() in segment.text.lower()
    )
    return hits
197
+
198
def display_tei_header(tree):
    """Write title, author, publisher and date from the TEI header to the page.

    Each field is looked up under ``teiHeader/fileDesc``. A field is written
    only when at least one matching element exists, and only the first match
    is shown.
    """
    file_desc = '//tei:teiHeader/tei:fileDesc'
    fields = (
        ("Title", f'{file_desc}/tei:titleStmt/tei:title'),
        ("Author", f'{file_desc}/tei:titleStmt/tei:author/tei:persName'),
        ("Publisher", f'{file_desc}/tei:publicationStmt/tei:publisher'),
        ("Date", f'{file_desc}/tei:publicationStmt/tei:date'),
    )

    # Resolve every lookup before anything is rendered.
    resolved = [(label, tree.xpath(path, namespaces=NS)) for label, path in fields]

    for label, matches in resolved:
        if matches:
            st.write(f"**{label}:** {matches[0].text}")
212
+
213
def display_code_wrapped(content):
    """Display code as wrapped text so long lines do not scroll horizontally.

    The text is HTML-escaped before it is embedded. The markup is emitted
    with ``unsafe_allow_html=True``, so without escaping any ``<tag>`` inside
    *content* would be interpreted as markup instead of being shown.

    Args:
        content: The code or XML text to display.
    """
    import html  # local import keeps this function self-contained

    escaped = html.escape(str(content))
    # Built without leading indentation or line breaks so that neither
    # Markdown nor ``white-space: pre-wrap`` adds anything around the code.
    markup = (
        '<div style="white-space: pre-wrap; word-wrap: break-word; font-size:14px; '
        'background-color: #f5f5f5; padding: 10px; border-radius: 5px; overflow: hidden;">'
        f'<code>{escaped}</code>'
        '</div>'
    )
    st.markdown(markup, unsafe_allow_html=True)
226
+
227
def _render_tei_sections(tree):
    """Render the commentary and edition sections of *tree* as XML code blocks."""
    commentaries = get_commentary(tree)
    if commentaries:
        st.markdown("**Commentary Sections:**")
        for idx, comm in enumerate(commentaries, start=1):
            st.markdown(f"**Commentary {idx} - {comm['subtype']}**")
            st.code(comm['content'], language='xml')
    else:
        st.write("No commentary sections found.")

    editions = get_editions(tree)
    if editions:
        st.markdown("**Edition Sections:**")
        for idx, edition in enumerate(editions, start=1):
            st.markdown(f"**Edition {idx} - Language: {edition['lang']}**")
            st.code(edition['content'], language='xml')
    else:
        st.write("No edition sections found.")


def main():
    """Run the Streamlit TEI XML viewer.

    Loads every ``*.xml`` file from ``./xmls``, offers author / place /
    keyword filters, shows the files matching all active filters (header,
    commentary, editions, associated places and keywords, download button),
    and lists all loaded files in a collapsible overview. The app stops early
    with a message when the folder is missing, empty, or contains no
    parseable file.
    """
    author_placeholder = "-- Select Author --"
    place_placeholder = "-- Select Place --"
    keyword_placeholder = "-- Select Keyword --"

    st.set_page_config(page_title="DigitalSEE TEI XML Viewer", layout="wide")

    st.markdown(
        """
        <style>
        /* Enable code wrapping in st.code blocks */
        pre, code {
            white-space: pre-wrap !important; /* Allows wrapping */
            word-wrap: break-word !important; /* Breaks long words */
            overflow-x: hidden !important; /* Hides horizontal scrollbar */
        }
        /* Adjust font size for better fit */
        .streamlit-expanderHeader, pre, code {
            font-size: 14px !important;
        }
        /* Ensure the container doesn't force a minimum width */
        .streamlit-expander, .block-container {
            max-width: 100% !important;
        }
        /* Optional: Style for the code background */
        pre {
            background-color: #f5f5f5 !important;
            padding: 10px !important;
            border-radius: 5px !important;
        }
        </style>
        """,
        unsafe_allow_html=True
    )

    st.title("📄 DigitalSEE TEI XML Viewer")

    xml_folder = Path("./xmls")

    if not xml_folder.exists() or not xml_folder.is_dir():
        st.error(f"The specified folder `{xml_folder}` does not exist or is not a directory.")
        st.stop()

    # Sorted so files are listed in a stable order on every rerun.
    xml_files = sorted(xml_folder.glob("*.xml"))
    if not xml_files:
        st.info(f"No XML files found in the folder `{xml_folder}`.")
        st.stop()

    st.sidebar.header("📂 XML Files Overview")
    st.sidebar.write(f"**Total XML Files Loaded:** {len(xml_files)}")

    # Keep trees and their paths in two parallel lists; files that fail to
    # parse are reported by parse_xml and left out of both.
    parsed_trees = []
    valid_files = []
    for xml_path in xml_files:
        tree = parse_xml(xml_path)
        if tree is not None:
            parsed_trees.append(tree)
            valid_files.append(xml_path)

    if not parsed_trees:
        st.error("No valid XML files were parsed successfully.")
        st.stop()

    all_authors = get_all_authors(parsed_trees)
    all_keywords = get_all_keywords(parsed_trees)
    all_place_names = get_all_place_names(parsed_trees)

    author_to_places, author_to_keywords = build_author_mappings(parsed_trees, valid_files)

    st.header("🔍 Search TEI XML Files")

    search_col1, search_col2, search_col3 = st.columns(3)

    with search_col1:
        st.markdown("**Search by Author**")
        selected_author = st.selectbox(
            "Select Author",
            options=[author_placeholder] + all_authors,
            key="author_select",
        )

    # Once an author is chosen, the other two dropdowns only offer values
    # that occur in that author's files.
    if selected_author != author_placeholder:
        filtered_places = sorted(author_to_places[selected_author])
        filtered_keywords = sorted(author_to_keywords[selected_author])
    else:
        filtered_places = all_place_names
        filtered_keywords = all_keywords

    with search_col2:
        st.markdown("**Search by Place Name**")
        selected_place = st.selectbox(
            "Select Place",
            options=[place_placeholder] + filtered_places,
            key="place_select",
        )

    with search_col3:
        st.markdown("**Search by Keyword**")
        selected_keyword = st.selectbox(
            "Select Keyword",
            options=[keyword_placeholder] + filtered_keywords,
            key="keyword_select",
        )

    if st.button("🔎 Search"):
        st.subheader("🔗 Search Results")

        # A file must satisfy every active filter. Filtering the already
        # parsed (tree, path) pairs keeps load order and avoids re-reading
        # the files from disk.
        active_filters = (
            (selected_author, author_placeholder, search_by_author),
            (selected_place, place_placeholder, search_by_place),
            (selected_keyword, keyword_placeholder, search_by_keyword),
        )
        matches = list(zip(parsed_trees, valid_files))
        for selection, placeholder, search in active_filters:
            if selection != placeholder:
                matches = [
                    (tree, xml_path)
                    for tree, xml_path in matches
                    if search(tree, selection)
                ]

        if matches:
            st.write(f"**Total Matches:** {len(matches)}")
            for tree, xml_path in matches:
                with st.expander(f"📄 {xml_path.name}"):
                    display_tei_header(tree)
                    _render_tei_sections(tree)

                    if selected_author != author_placeholder:
                        associated_places = sorted(author_to_places.get(selected_author, set()))
                        associated_keywords = sorted(author_to_keywords.get(selected_author, set()))
                    else:
                        # Both helpers already return sorted lists.
                        associated_places = get_all_place_names([tree])
                        associated_keywords = get_all_keywords([tree])

                    if associated_places:
                        st.markdown("**Associated Places:**")
                        st.write(", ".join(associated_places))
                    if associated_keywords:
                        st.markdown("**Associated Keywords:**")
                        st.write(", ".join(associated_keywords))

                    buffer = BytesIO()
                    tree.write(buffer, pretty_print=True, encoding='utf-8', xml_declaration=True)
                    st.download_button(
                        label="📥 Download XML",
                        data=buffer.getvalue(),
                        file_name=f"matched_{xml_path.name}",
                        mime="application/xml",
                        # Explicit key: every result shows a button with the
                        # same label, so give each one its own identity.
                        key=f"download_{xml_path.name}",
                    )
        else:
            st.write("No matching files found for the given search criteria.")

    with st.expander("📚 View All Loaded XML Files"):
        for tree, xml_path in zip(parsed_trees, valid_files):
            with st.container():
                st.markdown(f"### 📄 {xml_path.name}")
                display_tei_header(tree)
                _render_tei_sections(tree)

    st.sidebar.markdown("---")
    st.sidebar.header("Simple Querying Interface")
    st.sidebar.write(
        "Quickly search and filter TEI XML files to find relevant information or themes."
    )

    st.sidebar.header("XML Code Viewer")
    st.sidebar.write(
        "View detailed XML code for commentaries and editions in their original format."
    )

    st.sidebar.header("Downloadable Entries")
    st.sidebar.write(
        "Download entries for offline access and further analysis."
    )

    st.sidebar.header("Comprehensive Meta Information")
    st.sidebar.write(
        "Each entry includes rich metadata, such as XML file author details."
    )
443
+
444
+
445
+
446
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()