Spaces:
Sleeping
Sleeping
import streamlit as st | |
from lxml import etree | |
from pathlib import Path | |
from io import BytesIO | |
from collections import defaultdict | |
# Namespace URI of TEI elements, bound to the ``tei`` prefix that every XPath
# expression in this file uses.
TEI_NAMESPACE_URI = 'http://www.tei-c.org/ns/1.0'
NS = {'tei': TEI_NAMESPACE_URI}
def parse_xml(file_path):
    """Load one XML document from disk with lxml.

    The parser is created with ``remove_blank_text=True``.  Any exception
    raised while parsing is reported in the Streamlit UI through ``st.error``
    instead of being propagated.

    Args:
        file_path: Path-like object pointing at the XML file.  It is converted
            with ``str()`` for lxml, and its ``.name`` attribute is used in
            the error message.

    Returns:
        The parsed lxml tree, or ``None`` when parsing failed.
    """
    try:
        return etree.parse(
            str(file_path),
            etree.XMLParser(remove_blank_text=True),
        )
    except Exception as exc:
        # UI boundary: surface the problem to the user and let the caller
        # skip this file.
        st.error(f"Error parsing XML file `{file_path.name}`: {exc}")
        return None
def get_all_authors(parsed_trees):
    """Collect the unique bibliography author names found in the given trees.

    Author names are read from the direct text of
    ``tei:bibl/tei:author/tei:persName`` elements.

    Args:
        parsed_trees: Iterable of parsed trees exposing ``.xpath()``.

    Returns:
        Alphabetically sorted list of distinct, whitespace-stripped names.
        Elements whose text is missing or consists only of whitespace are
        ignored, so the result never contains an empty string.
    """
    authors = set()
    for tree in parsed_trees:
        for pers_name in tree.xpath('//tei:bibl/tei:author/tei:persName', namespaces=NS):
            name = (pers_name.text or '').strip()
            # Strip *before* testing: whitespace-only text would otherwise be
            # recorded as an empty author name.
            if name:
                authors.add(name)
    return sorted(authors)
def get_all_keywords(parsed_trees):
    """Collect the unique keywords found in the given trees.

    Keywords are read from the direct text of
    ``tei:keywords/tei:list/tei:item`` elements.  One item may hold several
    comma-separated keywords; each one is recorded separately.

    Args:
        parsed_trees: Iterable of parsed trees exposing ``.xpath()``.

    Returns:
        Alphabetically sorted list of distinct, whitespace-stripped keywords.
        Empty fragments (from trailing or doubled commas, or whitespace-only
        items) are dropped.
    """
    keywords = set()
    for tree in parsed_trees:
        for item in tree.xpath('//tei:keywords/tei:list/tei:item', namespaces=NS):
            if not item.text:
                continue
            fragments = (fragment.strip() for fragment in item.text.split(','))
            # "a, b," splits into ['a', ' b', '']; the blank must not become
            # a selectable keyword.
            keywords.update(fragment for fragment in fragments if fragment)
    return sorted(keywords)
def get_all_place_names(parsed_trees):
    """Collect the unique place names found in the given trees.

    Names are read from the direct text of four kinds of elements:

    * ``tei:provenance/tei:placeName``
    * ``tei:location/tei:name[@type="place"]``
    * ``tei:name[@type="contemporary"]`` inside a commentary ``tei:div``
    * ``tei:name[@type="current"]`` anywhere in the document

    Args:
        parsed_trees: Iterable of parsed trees exposing ``.xpath()``.

    Returns:
        Alphabetically sorted list of distinct, whitespace-stripped names.
        Blank values and the placeholder ``none`` (any letter case, with or
        without surrounding whitespace) are excluded.
    """
    place_queries = (
        '//tei:provenance/tei:placeName',
        '//tei:location/tei:name[@type="place"]',
        '//tei:div[@type="commentary"]//tei:name[@type="contemporary"]',
        '//tei:name[@type="current"]',
    )
    places = set()
    for tree in parsed_trees:
        for query in place_queries:
            for node in tree.xpath(query, namespaces=NS):
                name = (node.text or '').strip()
                # Compare the *stripped* value so that " None " is recognised
                # as the placeholder too.
                if name and name.lower() != 'none':
                    places.add(name)
    return sorted(places)
def build_author_mappings(parsed_trees, xml_files):
    """Map every author to the places and keywords of the documents citing them.

    For each tree, every author returned by ``get_all_authors`` is associated
    with all places (``get_all_place_names``) and all keywords
    (``get_all_keywords``) of that same tree.  Associations from several
    trees are merged.

    Args:
        parsed_trees: Iterable of parsed trees exposing ``.xpath()``.
        xml_files: Not used by this function; kept so existing call sites
            keep working.

    Returns:
        A tuple ``(author_to_places, author_to_keywords)`` of
        ``defaultdict(set)`` objects keyed by author name.  Every author gets
        an entry in both mappings, even when the associated set is empty.
    """
    author_to_places = defaultdict(set)
    author_to_keywords = defaultdict(set)
    for tree in parsed_trees:
        # Reuse the module's extraction helpers on a one-element list so the
        # rules (XPath sources, stripping, placeholder handling) live in a
        # single place instead of being copied here.
        authors = get_all_authors([tree])
        if not authors:
            continue
        places = get_all_place_names([tree])
        keywords = get_all_keywords([tree])
        for author in authors:
            author_to_places[author].update(places)
            author_to_keywords[author].update(keywords)
    return author_to_places, author_to_keywords
def get_commentary(tree):
    """Return the commentary divisions of one document as serialized XML.

    Args:
        tree: Parsed tree exposing ``.xpath()``.

    Returns:
        A list with one dict per ``tei:div[@type="commentary"]`` element, in
        XPath result order.  Each dict has the keys ``'subtype'`` (the
        element's ``subtype`` attribute, ``'general'`` when absent) and
        ``'content'`` (the element pretty-printed as a unicode string).
    """
    return [
        {
            'subtype': div.get('subtype', 'general'),
            'content': etree.tostring(div, pretty_print=True, encoding='unicode'),
        }
        for div in tree.xpath('//tei:div[@type="commentary"]', namespaces=NS)
    ]
def get_editions(tree):
    """Return the edition divisions of one document as serialized XML.

    Args:
        tree: Parsed tree exposing ``.xpath()``.

    Returns:
        A list with one dict per ``tei:div[@type="edition"]`` element, in
        XPath result order.  Each dict has the keys ``'lang'`` (the element's
        ``xml:lang`` attribute, ``'unknown'`` when absent) and ``'content'``
        (the element pretty-printed as a unicode string).
    """
    return [
        {
            # ``xml:lang`` lives in the reserved XML namespace, hence the
            # Clark-notation attribute name.
            'lang': div.get('{http://www.w3.org/XML/1998/namespace}lang', 'unknown'),
            'content': etree.tostring(div, pretty_print=True, encoding='unicode'),
        }
        for div in tree.xpath('//tei:div[@type="edition"]', namespaces=NS)
    ]
def search_by_author(tree, author_query):
    """Find bibliography authors whose name contains the query.

    Only ``tei:bibl/tei:author/tei:persName`` elements are inspected.  The
    comparison is a case-insensitive substring test against the element's
    direct text.

    Args:
        tree: Parsed tree exposing ``.xpath()``.
        author_query: Text to look for.

    Returns:
        A list of ``"Bibliography Author: <text>"`` strings, one per matching
        element; empty when nothing matches.
    """
    return [
        f"Bibliography Author: {pers_name.text}"
        for pers_name in tree.xpath('//tei:bibl/tei:author/tei:persName', namespaces=NS)
        if pers_name.text and author_query.lower() in pers_name.text.lower()
    ]
def search_by_place(tree, place_query):
    """Find the places in one document whose text contains the query.

    The same element kinds that ``get_all_place_names`` harvests are
    searched, so every place offered for selection can actually be found,
    plus the ``tei:geo`` elements below ``tei:location``:

    * ``tei:provenance/tei:placeName``
    * ``tei:name[@type="contemporary"]`` inside any commentary ``tei:div``
    * ``tei:location//tei:geo``
    * ``tei:location/tei:name[@type="place"]``
    * ``tei:name[@type="current"]``

    Matching is a case-insensitive substring test on the element's direct
    text.  Elements whose text is the placeholder ``none`` are skipped.

    Parameters:
        tree: Parsed tree exposing ``.xpath()``.
        place_query (str): The place name to search for.

    Returns:
        list: Strings of the form ``"<Source label>: <stripped text>"``
        describing where the place was found; empty when nothing matches.
    """
    sources = (
        ("Provenance Place", '//tei:provenance/tei:placeName'),
        ("Contemporary Name", '//tei:div[@type="commentary"]//tei:name[@type="contemporary"]'),
        ("Location Geo", '//tei:location//tei:geo'),
        ("Location Name", '//tei:location/tei:name[@type="place"]'),
        ("Current Name", '//tei:name[@type="current"]'),
    )
    needle = place_query.lower()
    results = []
    for label, query in sources:
        for node in tree.xpath(query, namespaces=NS):
            raw_text = node.text
            if not raw_text:
                continue
            cleaned = raw_text.strip()
            # "none" marks an absent value in the data; never report it.
            if cleaned.lower() == 'none':
                continue
            if needle in raw_text.lower():
                results.append(f"{label}: {cleaned}")
    return results
def search_by_keyword(tree, keyword):
    """Find keyword items and commentary segments that contain the keyword.

    Two element kinds are inspected, in this order:
    ``tei:keywords/tei:list/tei:item`` and ``tei:seg`` elements inside a
    commentary ``tei:div``.  The comparison is a case-insensitive substring
    test against the element's direct text.

    Args:
        tree: Parsed tree exposing ``.xpath()``.
        keyword: Text to look for.

    Returns:
        A list of ``"Keyword: <text>"`` and ``"Commentary Segment: <text>"``
        strings, one per matching element; empty when nothing matches.
    """
    sources = (
        ("Keyword", '//tei:keywords/tei:list/tei:item'),
        ("Commentary Segment", '//tei:div[@type="commentary"]//tei:seg'),
    )
    hits = []
    for label, expression in sources:
        for node in tree.xpath(expression, namespaces=NS):
            if node.text and keyword.lower() in node.text.lower():
                hits.append(f"{label}: {node.text}")
    return hits
def display_tei_header(tree):
    """Write title, author, publisher and date of a document to the page.

    Each value is taken from the first element matched below
    ``tei:teiHeader/tei:fileDesc``; its ``.text`` is interpolated as-is.  A
    field with no matching element is omitted.

    Args:
        tree: Parsed tree exposing ``.xpath()``.
    """
    header_fields = (
        ("Title", '//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title'),
        ("Author", '//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName'),
        ("Publisher", '//tei:teiHeader/tei:fileDesc/tei:publicationStmt/tei:publisher'),
        ("Date", '//tei:teiHeader/tei:fileDesc/tei:publicationStmt/tei:date'),
    )
    # Run every query first, then render, mirroring a lookup-then-display flow.
    lookups = [
        (label, tree.xpath(expression, namespaces=NS))
        for label, expression in header_fields
    ]
    for label, matches in lookups:
        if matches:
            st.write(f"**{label}:** {matches[0].text}")
def display_code_wrapped(content):
    """Display text as a code block whose long lines wrap.

    The text is placed in a styled ``<div>``/``<code>`` pair rendered through
    ``st.markdown`` with ``unsafe_allow_html=True``, which avoids horizontal
    scrolling.

    Args:
        content: The text to show.  It is converted with ``str()`` and
            HTML-escaped before being embedded, so markup characters in it
            (``<``, ``>``, ``&``, quotes) are displayed literally.
    """
    from html import escape

    # The page renders this string as raw HTML.  Without escaping, XML content
    # would be interpreted as markup (and could inject arbitrary HTML) instead
    # of being shown as code.
    safe_content = escape(str(content))
    markup = (
        '\n'
        '<div style="white-space: pre-wrap; word-wrap: break-word; font-size:14px; background-color: #f5f5f5; padding: 10px; border-radius: 5px; overflow: hidden;">\n'
        f'<code>{safe_content}</code>\n'
        '</div>\n'
    )
    st.markdown(markup, unsafe_allow_html=True)
def _render_sections(tree):
    """Render the commentary and edition divisions of one document as XML code."""
    commentaries = get_commentary(tree)
    if commentaries:
        st.markdown("**Commentary Sections:**")
        for idx, comm in enumerate(commentaries, start=1):
            st.markdown(f"**Commentary {idx} - {comm['subtype']}**")
            st.code(comm['content'], language='xml')
    else:
        st.write("No commentary sections found.")

    editions = get_editions(tree)
    if editions:
        st.markdown("**Edition Sections:**")
        for idx, edition in enumerate(editions, start=1):
            st.markdown(f"**Edition {idx} - Language: {edition['lang']}**")
            st.code(edition['content'], language='xml')
    else:
        st.write("No edition sections found.")


def main():
    """Run the Streamlit page: load ``./xmls/*.xml``, offer search, show results.

    Flow:
        1. Configure the page and inject CSS that makes code blocks wrap.
        2. Parse every ``*.xml`` file in ``./xmls``; stop with a message when
           the folder is missing, empty, or nothing parses.
        3. Offer author / place / keyword select boxes.  Choosing an author
           narrows the place and keyword options to that author's.
        4. On "Search", show the files that satisfy *all* chosen criteria,
           each with header, commentary, editions, associated places and
           keywords, and a download button.
        5. Always offer an expander listing every loaded file.
    """
    # NOTE(review): several labels below start with "π" (one with "π₯").
    # These look like emoji that were mis-decoded somewhere upstream; they
    # are reproduced unchanged here - confirm the intended glyphs.
    author_placeholder = "-- Select Author --"
    place_placeholder = "-- Select Place --"
    keyword_placeholder = "-- Select Keyword --"

    st.set_page_config(page_title="DigitalSEE TEI XML Viewer", layout="wide")
    st.markdown(
        """
        <style>
        /* Enable code wrapping in st.code blocks */
        pre, code {
        white-space: pre-wrap !important; /* Allows wrapping */
        word-wrap: break-word !important; /* Breaks long words */
        overflow-x: hidden !important; /* Hides horizontal scrollbar */
        }
        /* Adjust font size for better fit */
        .streamlit-expanderHeader, pre, code {
        font-size: 14px !important;
        }
        /* Ensure the container doesn't force a minimum width */
        .streamlit-expander, .block-container {
        max-width: 100% !important;
        }
        /* Optional: Style for the code background */
        pre {
        background-color: #f5f5f5 !important;
        padding: 10px !important;
        border-radius: 5px !important;
        }
        </style>
        """,
        unsafe_allow_html=True
    )
    st.title("π DigitalSEE: TEI Navigator")

    xml_folder = Path("./xmls")
    if not xml_folder.is_dir():
        st.error(f"The specified folder `{xml_folder}` does not exist or is not a directory.")
        st.stop()
    # glob() yields files in arbitrary order; sort so the page lists them in
    # the same order on every run.
    xml_files = sorted(xml_folder.glob("*.xml"))
    if not xml_files:
        st.info(f"No XML files found in the folder `{xml_folder}`.")
        st.stop()

    st.sidebar.header("π XML Files Overview")
    st.sidebar.write(f"**Total XML Files Loaded:** {len(xml_files)}")

    # parsed_trees[i] is the tree of valid_files[i]; unparsable files are
    # skipped (parse_xml already reported them).
    parsed_trees = []
    valid_files = []
    for file in xml_files:
        tree = parse_xml(file)
        if tree is not None:
            parsed_trees.append(tree)
            valid_files.append(file)
    if not parsed_trees:
        st.error("No valid XML files were parsed successfully.")
        st.stop()

    all_authors = get_all_authors(parsed_trees)
    all_keywords = get_all_keywords(parsed_trees)
    all_place_names = get_all_place_names(parsed_trees)
    author_to_places, author_to_keywords = build_author_mappings(parsed_trees, valid_files)

    st.header("π Search TEI XML Files")
    search_col1, search_col2, search_col3 = st.columns(3)
    with search_col1:
        st.markdown("**Search by Author**")
        selected_author = st.selectbox("Select Author", options=[author_placeholder] + all_authors, key="author_select")
    author_chosen = selected_author != author_placeholder
    if author_chosen:
        filtered_places = sorted(author_to_places[selected_author])
        filtered_keywords = sorted(author_to_keywords[selected_author])
    else:
        filtered_places = all_place_names
        filtered_keywords = all_keywords
    with search_col2:
        st.markdown("**Search by Place Name**")
        selected_place = st.selectbox("Select Place", options=[place_placeholder] + filtered_places, key="place_select")
    with search_col3:
        st.markdown("**Search by Keyword**")
        selected_keyword = st.selectbox("Select Keyword", options=[keyword_placeholder] + filtered_keywords, key="keyword_select")

    if st.button("π Search"):
        st.subheader("π Search Results")
        # Each chosen criterion narrows the candidate set; an untouched
        # select box imposes no restriction.
        criteria = (
            (selected_author, author_placeholder, search_by_author),
            (selected_place, place_placeholder, search_by_place),
            (selected_keyword, keyword_placeholder, search_by_keyword),
        )
        matched_files = set(valid_files)
        for selection, placeholder, search in criteria:
            if selection == placeholder:
                continue
            matched_files &= {
                file
                for tree, file in zip(parsed_trees, valid_files)
                if search(tree, selection)
            }

        if matched_files:
            st.write(f"**Total Matches:** {len(matched_files)}")
            # Walk the already-parsed trees in load order: this avoids
            # re-reading every matched file from disk and gives a stable
            # result order (a set has none).
            for tree, file in zip(parsed_trees, valid_files):
                if file not in matched_files:
                    continue
                with st.expander(f"π {file.name}"):
                    display_tei_header(tree)
                    _render_sections(tree)

                    if author_chosen:
                        associated_places = sorted(author_to_places.get(selected_author, set()))
                        associated_keywords = sorted(author_to_keywords.get(selected_author, set()))
                    else:
                        associated_places = sorted(get_all_place_names([tree]))
                        associated_keywords = sorted(get_all_keywords([tree]))
                    if associated_places:
                        st.markdown("**Associated Places:**")
                        st.write(", ".join(associated_places))
                    if associated_keywords:
                        st.markdown("**Associated Keywords:**")
                        st.write(", ".join(associated_keywords))

                    buffer = BytesIO()
                    tree.write(buffer, pretty_print=True, encoding='utf-8', xml_declaration=True)
                    buffer.seek(0)
                    st.download_button(
                        label="π₯ Download XML",
                        data=buffer,
                        file_name=f"matched_{file.name}",
                        mime="application/xml"
                    )
        else:
            st.write("No matching files found for the given search criteria.")

    with st.expander("π View All Loaded XML Files"):
        for tree, file in zip(parsed_trees, valid_files):
            with st.container():
                st.markdown(f"### π {file.name}")
                display_tei_header(tree)
                _render_sections(tree)

    st.sidebar.markdown("---")
    st.sidebar.header("Simple Querying Interface")
    st.sidebar.write(
        "Quickly search and filter TEI XML files to find relevant information or themes."
    )
    st.sidebar.header("XML Code Viewer")
    st.sidebar.write(
        "View detailed XML code for commentaries and editions in their original format."
    )
    st.sidebar.header("Downloadable Entries")
    st.sidebar.write(
        "Download entries for offline access and further analysis."
    )
    st.sidebar.header("Comprehensive Meta Information")
    st.sidebar.write(
        "Each entry includes rich metadata, such as XML file author details."
    )


if __name__ == "__main__":
    main()