"""
Parse documents, currently pdf and xml are supported.
"""

import os

from langchain.document_loaders import (
    PyMuPDFLoader,
)
from langchain.docstore.document import Document
from langchain.text_splitter import (
    # RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
)


def load_pdf_as_docs(pdf_path, loader_module=None, load_kwargs=None):
    """Load and parse pdf file(s)."""

    if pdf_path.endswith(".pdf"):  # single file
        pdf_docs = [pdf_path]
    else:  # a directory
        pdf_docs = [
            os.path.join(pdf_path, f)
            for f in os.listdir(pdf_path)
            if f.endswith(".pdf")
        ]

    if load_kwargs is None:
        load_kwargs = {}

    docs = []
    if loader_module is None:  # default to PyMuPDFLoader
        loader_module = PyMuPDFLoader
    for pdf in pdf_docs:
        loader = loader_module(pdf, **load_kwargs)
        docs.extend(loader.load())  # loader.load() returns a list of Documents

    return docs
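

# Example usage (hypothetical paths):
#   docs = load_pdf_as_docs("papers/")             # every PDF in a directory
#   docs = load_pdf_as_docs("papers/example.pdf")  # a single file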


def load_xml_as_docs(xml_path, loader_module=None, load_kwargs=None):
    """Load and parse xml file(s)."""

    from bs4 import BeautifulSoup
    from unstructured.cleaners.core import group_broken_paragraphs

    if xml_path.endswith(".xml"):  # single file
        xml_docs = [xml_path]
    else:  # a directory
        xml_docs = [
            os.path.join(xml_path, f)
            for f in os.listdir(xml_path)
            if f.endswith(".xml")
        ]

    if load_kwargs is None:
        load_kwargs = {}

    docs = []
    for xml_file in xml_docs:
        with open(xml_file) as fp:
            soup = BeautifulSoup(fp, features="xml")
            # Collect all text nodes and join them into one string
            page_text = soup.find_all(string=True)
            parsed_text = "\n".join(page_text)  # " ".join gives similar results
            # Re-join paragraphs that were broken across lines
            parsed_text_grouped = group_broken_paragraphs(parsed_text)

            # Extract metadata (author and title) from the TEI header
            try:
                from lxml import etree as ET

                tree = ET.parse(xml_file)

                # TEI namespace
                ns = {"tei": "http://www.tei-c.org/ns/1.0"}
                # Read the first author's personal name
                pers_name_elements = tree.xpath(
                    "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName",
                    namespaces=ns,
                )
                first_per = pers_name_elements[0].text
                author_info = first_per + " et al."

                title_elements = tree.xpath(
                    "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title", namespaces=ns
                )
                title = title_elements[0].text

                # Combine author and title into a single source string
                source_info = "_".join([author_info, title])
            except Exception:
                source_info = "unknown"

            # Possibly a better parsing approach; TODO: discuss with TUD
            # first_author = soup.find("author")
            # publication_year = soup.find("date", attrs={'type': 'published'})
            # title = soup.find("title")
            # source_info = [first_author, publication_year, title]
            # source_info_str = "_".join([info.text.strip() if info is not None else "unknown" for info in source_info])

            docs.append(
                Document(
                    page_content=parsed_text_grouped, metadata={"source": source_info}
                )
            )

    return docs
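

# Example usage (hypothetical path to a directory of TEI XML files):
#   docs = load_xml_as_docs("xml_output/")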


def get_doc_chunks(docs, splitter=None):
    """Split docs into chunks."""

    if splitter is None:
        # splitter = RecursiveCharacterTextSplitter(  # original default
        #    # separators=["\n\n", "\n"], chunk_size=1024, chunk_overlap=256
        #    separators=["\n\n", "\n"], chunk_size=256, chunk_overlap=128
        # )
        # The SpaCy-based splitter seems to produce better chunks
        splitter = SpacyTextSplitter.from_tiktoken_encoder(
            chunk_size=512,
            chunk_overlap=128,
        )
    chunks = splitter.split_documents(docs)

    return chunks
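

# Minimal end-to-end sketch (hypothetical paths): load PDFs and XMLs,
# then split everything into chunks ready for indexing.
if __name__ == "__main__":
    all_docs = []
    all_docs.extend(load_pdf_as_docs("data/pdfs"))  # hypothetical directory
    all_docs.extend(load_xml_as_docs("data/xmls"))  # hypothetical directory
    chunks = get_doc_chunks(all_docs)
    print(f"Loaded {len(all_docs)} documents, split into {len(chunks)} chunks")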