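"""DigitalSEE TEI Navigator: a Streamlit viewer for TEI XML files.

Expects TEI XML files in an ./xmls folder relative to the working directory;
run locally with `streamlit run app.py`.
"""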
import streamlit as st
from lxml import etree
from pathlib import Path
from io import BytesIO
from collections import defaultdict
import html  # used to escape XML markup before embedding it in raw HTML
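# TEI namespace prefix mapping used by every XPath query below.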
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}
def parse_xml(file_path):
"""Parses an XML file and returns the tree."""
try:
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(str(file_path), parser)
return tree
except Exception as e:
st.error(f"Error parsing XML file `{file_path.name}`: {e}")
return None
def get_all_authors(parsed_trees):
"""Extracts all unique authors from the list of XML trees."""
authors = set()
for tree in parsed_trees:
# From bibliography
bib_authors = tree.xpath('//tei:bibl/tei:author/tei:persName', namespaces=NS)
for author in bib_authors:
if author.text:
authors.add(author.text.strip())
return sorted(authors)
def get_all_keywords(parsed_trees):
"""Extracts all unique keywords from the list of XML trees."""
keywords = set()
for tree in parsed_trees:
keyword_items = tree.xpath('//tei:keywords/tei:list/tei:item', namespaces=NS)
for item in keyword_items:
if item.text:
parts = [kw.strip() for kw in item.text.split(',')]
keywords.update(parts)
return sorted(keywords)
def get_all_place_names(parsed_trees):
"""Extracts all unique place names from the list of XML trees."""
places = set()
for tree in parsed_trees:
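        # Gather place names from provenance, location names, and contemporary/current name elements.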
provenance_places = tree.xpath('//tei:provenance/tei:placeName', namespaces=NS)
for place in provenance_places:
if place.text and place.text.lower() != 'none':
places.add(place.text.strip())
location_names = tree.xpath('//tei:location/tei:name[@type="place"]', namespaces=NS)
for name in location_names:
if name.text and name.text.lower() != 'none':
places.add(name.text.strip())
contemporary_names = tree.xpath('//tei:div[@type="commentary"]//tei:name[@type="contemporary"]', namespaces=NS)
for name in contemporary_names:
if name.text and name.text.lower() != 'none':
places.add(name.text.strip())
current_names = tree.xpath('//tei:name[@type="current"]', namespaces=NS)
for name in current_names:
if name.text and name.text.lower() != 'none':
places.add(name.text.strip())
return sorted(places)
def build_author_mappings(parsed_trees):
"""
Builds mappings from authors to their associated places and keywords.
Returns:
author_to_places (dict): Maps each author to a set of associated places.
author_to_keywords (dict): Maps each author to a set of associated keywords.
"""
author_to_places = defaultdict(set)
author_to_keywords = defaultdict(set)
for tree in parsed_trees:
# Extract authors
authors = set()
bib_authors = tree.xpath('//tei:bibl/tei:author/tei:persName', namespaces=NS)
for author in bib_authors:
if author.text:
authors.add(author.text.strip())
# Extract places
places = set()
provenance_places = tree.xpath('//tei:provenance/tei:placeName', namespaces=NS)
for place in provenance_places:
if place.text and place.text.lower() != 'none':
places.add(place.text.strip())
location_names = tree.xpath('//tei:location/tei:name[@type="place"]', namespaces=NS)
for name in location_names:
if name.text and name.text.lower() != 'none':
places.add(name.text.strip())
contemporary_names = tree.xpath('//tei:div[@type="commentary"]//tei:name[@type="contemporary"]', namespaces=NS)
for name in contemporary_names:
if name.text and name.text.lower() != 'none':
places.add(name.text.strip())
current_names = tree.xpath('//tei:name[@type="current"]', namespaces=NS)
for name in current_names:
if name.text and name.text.lower() != 'none':
places.add(name.text.strip())
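        # Extract keywords (keyword items may hold comma-separated values)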
keywords = set()
keyword_items = tree.xpath('//tei:keywords/tei:list/tei:item', namespaces=NS)
for item in keyword_items:
if item.text:
parts = [kw.strip() for kw in item.text.split(',')]
keywords.update(parts)
for author in authors:
author_to_places[author].update(places)
author_to_keywords[author].update(keywords)
return author_to_places, author_to_keywords
def get_commentary(tree):
"""Extracts commentary sections from a single XML tree."""
commentaries = tree.xpath('//tei:div[@type="commentary"]', namespaces=NS)
commentary_list = []
for comm in commentaries:
subtype = comm.get('subtype', 'general')
content = etree.tostring(comm, pretty_print=True, encoding='unicode')
commentary_list.append({'subtype': subtype, 'content': content})
return commentary_list
def get_editions(tree):
"""Extracts edition sections from a single XML tree."""
editions = tree.xpath('//tei:div[@type="edition"]', namespaces=NS)
edition_list = []
for edition in editions:
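        # xml:lang lives in the XML namespace, so it is read via its Clark-notation name.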
lang = edition.get('{http://www.w3.org/XML/1998/namespace}lang', 'unknown')
content = etree.tostring(edition, pretty_print=True, encoding='unicode')
edition_list.append({'lang': lang, 'content': content})
return edition_list
def search_by_author(tree, author_query):
"""Searches for the author in titleStmt and bibliography."""
results = []
bib_authors = tree.xpath('//tei:bibl/tei:author/tei:persName', namespaces=NS)
for author in bib_authors:
if author.text and author_query.lower() in author.text.lower():
results.append(f"Bibliography Author: {author.text}")
return results
def search_by_place(tree, place_query):
"""
Searches for the place in provenance, contemporary names, and location geo elements.
Parameters:
tree (etree.Element): Parsed XML tree.
place_query (str): The place name to search for.
Returns:
list: A list of strings describing where the place was found.
"""
results = []
place_query_lower = place_query.lower()
provenance_places = tree.xpath('//tei:provenance/tei:placeName', namespaces=NS)
for place in provenance_places:
if place.text and place_query_lower in place.text.lower() and place.text.lower() != "none":
results.append(f"Provenance Place: {place.text.strip()}")
contemporary_names = tree.xpath(
'//tei:div[@type="commentary" and @subtype="general"]//tei:name[@type="contemporary"]',
namespaces=NS
)
for name in contemporary_names:
if name.text and place_query_lower in name.text.lower():
results.append(f"Contemporary Name: {name.text.strip()}")
geo_elements = tree.xpath('//tei:location//tei:geo', namespaces=NS)
for geo in geo_elements:
if geo.text and place_query_lower in geo.text.lower() and geo.text.lower() != "none":
results.append(f"Location Geo: {geo.text.strip()}")
return results
def search_by_keyword(tree, keyword):
"""Searches for the keyword in keywords and commentary segments."""
results = []
keyword_items = tree.xpath('//tei:keywords/tei:list/tei:item', namespaces=NS)
for item in keyword_items:
if item.text and keyword.lower() in item.text.lower():
results.append(f"Keyword: {item.text}")
commentary_segs = tree.xpath('//tei:div[@type="commentary"]//tei:seg', namespaces=NS)
for seg in commentary_segs:
if seg.text and keyword.lower() in seg.text.lower():
results.append(f"Commentary Segment: {seg.text}")
return results
def display_tei_header(tree):
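    """Displays the title, author, publisher, and date from the teiHeader, when present."""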
title = tree.xpath('//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title', namespaces=NS)
author = tree.xpath('//tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName', namespaces=NS)
publication = tree.xpath('//tei:teiHeader/tei:fileDesc/tei:publicationStmt/tei:publisher', namespaces=NS)
date = tree.xpath('//tei:teiHeader/tei:fileDesc/tei:publicationStmt/tei:date', namespaces=NS)
if title:
st.write(f"**Title:** {title[0].text}")
if author:
st.write(f"**Author:** {author[0].text}")
if publication:
st.write(f"**Publisher:** {publication[0].text}")
if date:
st.write(f"**Date:** {date[0].text}")
def display_code_wrapped(content):
"""
Custom function to display code with wrapping using st.markdown and HTML.
This avoids horizontal scrolling by wrapping long lines.
"""
st.markdown(
f"""
<div style="white-space: pre-wrap; word-wrap: break-word; font-size:14px; background-color: #f5f5f5; padding: 10px; border-radius: 5px; overflow: hidden;">
        <code>{html.escape(content)}</code>
</div>
""",
unsafe_allow_html=True
)
def main():
st.set_page_config(page_title="DigitalSEE TEI XML Viewer", layout="wide")
st.markdown(
"""
<style>
/* Enable code wrapping in st.code blocks */
pre, code {
white-space: pre-wrap !important; /* Allows wrapping */
word-wrap: break-word !important; /* Breaks long words */
overflow-x: hidden !important; /* Hides horizontal scrollbar */
}
/* Adjust font size for better fit */
.streamlit-expanderHeader, pre, code {
font-size: 14px !important;
}
/* Ensure the container doesn't force a minimum width */
.streamlit-expander, .block-container {
max-width: 100% !important;
}
/* Optional: Style for the code background */
pre {
background-color: #f5f5f5 !important;
padding: 10px !important;
border-radius: 5px !important;
}
</style>
""",
unsafe_allow_html=True
)
st.title("πŸ“„ DigitalSEE: TEI Navigator")
xml_folder = Path("./xmls")
if not xml_folder.exists() or not xml_folder.is_dir():
st.error(f"The specified folder `{xml_folder}` does not exist or is not a directory.")
st.stop()
xml_files = list(xml_folder.glob("*.xml"))
if not xml_files:
st.info(f"No XML files found in the folder `{xml_folder}`.")
st.stop()
st.sidebar.header("πŸ“‚ XML Files Overview")
st.sidebar.write(f"**Total XML Files Loaded:** {len(xml_files)}")
parsed_trees = []
valid_files = []
for file in xml_files:
tree = parse_xml(file)
if tree is not None:
parsed_trees.append(tree)
valid_files.append(file)
if not parsed_trees:
st.error("No valid XML files were parsed successfully.")
st.stop()
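    # Pre-compute dropdown options and author-to-place/keyword mappings across the corpus.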
all_authors = get_all_authors(parsed_trees)
all_keywords = get_all_keywords(parsed_trees)
all_place_names = get_all_place_names(parsed_trees)
    author_to_places, author_to_keywords = build_author_mappings(parsed_trees)
st.header("πŸ” Search TEI XML Files")
search_col1, search_col2, search_col3 = st.columns(3)
with search_col1:
st.markdown("**Search by Author**")
selected_author = st.selectbox("Select Author", options=["-- Select Author --"] + all_authors, key="author_select")
if selected_author != "-- Select Author --":
filtered_places = sorted(author_to_places[selected_author])
filtered_keywords = sorted(author_to_keywords[selected_author])
else:
filtered_places = all_place_names
filtered_keywords = all_keywords
with search_col2:
st.markdown("**Search by Place Name**")
selected_place = st.selectbox("Select Place", options=["-- Select Place --"] + filtered_places, key="place_select")
with search_col3:
st.markdown("**Search by Keyword**")
selected_keyword = st.selectbox("Select Keyword", options=["-- Select Keyword --"] + filtered_keywords, key="keyword_select")
if st.button("πŸ”Ž Search"):
st.subheader("πŸ”— Search Results")
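        # Start from every valid file and intersect with the matches for each active filter (logical AND).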
matched_files = set(valid_files)
if selected_author != "-- Select Author --":
author_matched = set()
for tree, file in zip(parsed_trees, valid_files):
if search_by_author(tree, selected_author):
author_matched.add(file)
matched_files = matched_files.intersection(author_matched)
if selected_place != "-- Select Place --":
place_matched = set()
for tree, file in zip(parsed_trees, valid_files):
if search_by_place(tree, selected_place):
place_matched.add(file)
matched_files = matched_files.intersection(place_matched)
if selected_keyword != "-- Select Keyword --":
keyword_matched = set()
for tree, file in zip(parsed_trees, valid_files):
if search_by_keyword(tree, selected_keyword):
keyword_matched.add(file)
matched_files = matched_files.intersection(keyword_matched)
if matched_files:
st.write(f"**Total Matches:** {len(matched_files)}")
for file in matched_files:
tree = parse_xml(file)
if tree is not None:
with st.expander(f"πŸ“„ {file.name}"):
display_tei_header(tree)
commentaries = get_commentary(tree)
if commentaries:
st.markdown("**Commentary Sections:**")
for idx, comm in enumerate(commentaries, start=1):
st.markdown(f"**Commentary {idx} - {comm['subtype']}**")
st.code(comm['content'], language='xml')
else:
st.write("No commentary sections found.")
editions = get_editions(tree)
if editions:
st.markdown("**Edition Sections:**")
for idx, edition in enumerate(editions, start=1):
st.markdown(f"**Edition {idx} - Language: {edition['lang']}**")
st.code(edition['content'], language='xml')
else:
st.write("No edition sections found.")
                        if selected_author != "-- Select Author --":
                            associated_places = sorted(author_to_places.get(selected_author, set()))
                            associated_keywords = sorted(author_to_keywords.get(selected_author, set()))
                        else:
                            associated_places = sorted(get_all_place_names([tree]))
                            associated_keywords = sorted(get_all_keywords([tree]))
if associated_places:
st.markdown("**Associated Places:**")
st.write(", ".join(associated_places))
if associated_keywords:
st.markdown("**Associated Keywords:**")
st.write(", ".join(associated_keywords))
buffer = BytesIO()
tree.write(buffer, pretty_print=True, encoding='utf-8', xml_declaration=True)
buffer.seek(0)
st.download_button(
label="πŸ“₯ Download XML",
data=buffer,
file_name=f"matched_{file.name}",
mime="application/xml"
)
else:
st.write("No matching files found for the given search criteria.")
with st.expander("πŸ“š View All Loaded XML Files"):
for tree, file in zip(parsed_trees, valid_files):
with st.container():
st.markdown(f"### πŸ“„ {file.name}")
display_tei_header(tree)
commentaries = get_commentary(tree)
if commentaries:
st.markdown("**Commentary Sections:**")
for idx, comm in enumerate(commentaries, start=1):
st.markdown(f"**Commentary {idx} - {comm['subtype']}**")
st.code(comm['content'], language='xml')
else:
st.write("No commentary sections found.")
editions = get_editions(tree)
if editions:
st.markdown("**Edition Sections:**")
for idx, edition in enumerate(editions, start=1):
st.markdown(f"**Edition {idx} - Language: {edition['lang']}**")
st.code(edition['content'], language='xml')
else:
st.write("No edition sections found.")
st.sidebar.markdown("---")
st.sidebar.header("Simple Querying Interface")
st.sidebar.write(
"Quickly search and filter TEI XML files to find relevant information or themes."
)
st.sidebar.header("XML Code Viewer")
st.sidebar.write(
"View detailed XML code for commentaries and editions in their original format."
)
st.sidebar.header("Downloadable Entries")
st.sidebar.write(
"Download entries for offline access and further analysis."
)
st.sidebar.header("Comprehensive Meta Information")
st.sidebar.write(
"Each entry includes rich metadata, such as XML file author details."
)
if __name__ == "__main__":
main()