jdenis-insn
committed on
Commit
·
737f55b
Parent(s):
de91b1c
init commit for build
- .gitignore +168 -0
- Dockerfile +43 -0
- README.md +2 -2
- backend/app/__init__.py +3 -0
- backend/app/internal/__init__.py +0 -0
- backend/app/internal/bdd_manager.py +129 -0
- backend/app/internal/embedder.py +26 -0
- backend/app/internal/export_report.py +95 -0
- backend/app/internal/llm_chat.py +346 -0
- backend/app/internal/parser.py +104 -0
- backend/app/internal/template_prompt.py +79 -0
- backend/app/main.py +32 -0
- backend/app/resources/logo_ademe.png +0 -0
- backend/app/routers/__init__.py +0 -0
- backend/app/routers/chatting.py +215 -0
- backend/app/routers/embedding.py +169 -0
- backend/app/settings.py +30 -0
- backend/test/test_main.py +6 -0
- dockerignore +17 -0
- frontend/app/__init__.py +0 -0
- frontend/app/main.py +200 -0
- frontend/app/resources/aide-financiere-ademe.JPG +0 -0
- frontend/app/resources/logo_ademe.png +0 -0
- frontend/app/settings.py +11 -0
- frontend/test/test_main.py +6 -0
- nginx.conf +36 -0
- pyproject.toml +144 -0
- supervisord.conf +28 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,168 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env*
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# BDD
qdrant_storage/
memory

*.pdf
Dockerfile
ADDED
@@ -0,0 +1,43 @@
# Step 1: Base image and system dependencies
FROM python:3.12-slim as base

RUN apt-get update && apt-get install -y \
    nginx \
    supervisor \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Step 2: Create the 'user' account with UID 1000
RUN useradd -m -u 1000 user

# Step 3: Set environment variables and the working directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Step 4: Install 'uv' (Python project manager)
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Step 5: Copy the configuration files with the appropriate permissions
COPY --chown=user:user nginx.conf /etc/nginx/nginx.conf
COPY --chown=user:user supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Step 6: Copy the application files with the appropriate permissions
COPY --chown=user:user . $HOME/app

# Step 7: Install the Python dependencies
COPY --chown=user:user pyproject.toml uv.lock ./
RUN uv sync --no-dev --frozen --no-cache

# Step 8: Download and install Qdrant
RUN curl -fsSL https://github.com/qdrant/qdrant/releases/latest/download/qdrant-linux-x86_64 -o /usr/local/bin/qdrant \
    && chmod +x /usr/local/bin/qdrant

# Step 9: Expose the required ports
EXPOSE 80 6333

# Step 10: Switch to the 'user' account
USER user

# Step 11: Start the application with supervisord
CMD ["supervisord", "-n", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title:
+title: CV_JBDENIS
 emoji: 👁
 colorFrom: indigo
-colorTo:
+colorTo: red
 sdk: docker
 pinned: false
 license: unlicense
backend/app/__init__.py
ADDED
@@ -0,0 +1,3 @@
def hello() -> str:
    """Return greetings."""
    return "Hello from my-app!"
backend/app/internal/__init__.py
ADDED
File without changes
backend/app/internal/bdd_manager.py
ADDED
@@ -0,0 +1,129 @@
from typing import Any
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import CollectionStatus, Distance, VectorParams
from qdrant_client.http.exceptions import UnexpectedResponse
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.retrievers import EnsembleRetriever
from langchain_core.vectorstores import VectorStore
from app.settings import settings


try:
    client = QdrantClient(url=settings.qdrant_url)

except Exception as e:
    raise Exception(f"Error connecting to Qdrant: {e}")


def create_collection(collection_name: str):
    """
    Create a collection in Qdrant if it does not already exist.

    Args:
        collection_name (str): The name of the collection to be created.

    Returns:
        str: A message indicating the result of the operation.

    Raises:
        Exception: If there is an error during the collection creation process.
    """
    try:
        existing_collections = client.get_collections()
        if any(col.name == collection_name for col in existing_collections.collections):
            return f"Collection '{collection_name}' already exists."

        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=768, distance=Distance.COSINE),
        )
        return f"Collection '{collection_name}' created successfully."

    except Exception as e:
        raise Exception(f"Error creating collection '{collection_name}': {e}")


def get_vector_store(embeddings, collection_name):
    """
    Retrieve or initialize a Qdrant vector store for a given collection.

    Args:
        embeddings: The embedding model or function to be used for vectorization.
        collection_name (str): The name of the Qdrant collection.

    Returns:
        QdrantVectorStore: A Qdrant vector store object tied to the specified collection.

    Raises:
        Exception: If the collection does not exist or there is an issue accessing it.
    """
    try:
        collection_info = client.get_collection(collection_name)

        if collection_info.status != CollectionStatus.GREEN:
            raise Exception(
                f"Collection '{collection_name}' is not active (status: {collection_info.status})."
            )

        return QdrantVectorStore(
            client=client, collection_name=collection_name, embedding=embeddings
        )

    except UnexpectedResponse as e:
        raise Exception(
            f"Collection '{collection_name}' does not exist or could not be accessed: {e}"
        )

    except Exception as e:
        raise Exception(
            f"An error occurred while retrieving the vector store for '{collection_name}': {e}"
        )


def get_retriever(vector_store: VectorStore) -> VectorStoreRetriever:
    """
    Converts a vector store into a retriever instance.

    Args:
        vector_store: An object that represents the vector store. It must have an `as_retriever` method.

    Returns
    -------
    VectorStoreRetriever: An instance of VectorStoreRetriever for querying the vector store.

    Raises
    ------
    AttributeError: If the provided vector store does not have an `as_retriever` method.
    """  # noqa: D401
    if not hasattr(vector_store, "as_retriever"):
        raise AttributeError(
            "The provided vector store does not have an 'as_retriever' method."
        )

    return vector_store.as_retriever(
        search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.7}
    )


def get_ensemble_retriever(
    retriever_doc: VectorStoreRetriever, retriever_user: VectorStoreRetriever
) -> EnsembleRetriever:
    """
    Create an ensemble retriever that combines two retrievers with specified weights.

    Args:
        retriever_doc (VectorStoreRetriever): The first retriever,
            typically for document retrieval.
        retriever_user (VectorStoreRetriever): The second retriever,
            typically for user-specific retrieval.

    Returns:
        EnsembleRetriever: An instance of `EnsembleRetriever` combining the two retrievers
        with the specified weights (0.2 for `retriever_doc` and 0.8 for `retriever_user`).
    """
    return EnsembleRetriever(
        retrievers=[retriever_doc, retriever_user], weights=[0.2, 0.8]
    )
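A minimal sketch of how these helpers compose, assuming a running Qdrant instance and valid settings; the provider string and the collection names "demo_docs" / "demo_user" are illustrative, not taken from the commit:

# Minimal wiring sketch (assumes a reachable Qdrant; collection names are placeholders).
from app.internal.bdd_manager import (
    create_collection,
    get_ensemble_retriever,
    get_retriever,
    get_vector_store,
)
from app.internal.embedder import get_embedder

embedder = get_embedder(provider="hf_api")

# Both collections must exist before a vector store can wrap them.
create_collection("demo_docs")
create_collection("demo_user")

doc_store = get_vector_store(embedder, "demo_docs")
user_store = get_vector_store(embedder, "demo_user")

# Each retriever drops hits below the 0.7 similarity threshold set in get_retriever.
doc_retriever = get_retriever(doc_store)
user_retriever = get_retriever(user_store)

# The ensemble weights user-collection results 0.8 vs. 0.2 for the document corpus.
retriever = get_ensemble_retriever(doc_retriever, user_retriever)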
backend/app/internal/embedder.py
ADDED
@@ -0,0 +1,26 @@
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from app.settings import settings


def get_embedder(provider: str = "hf_api"):
    if provider == "hf_local":
        return HuggingFaceEmbeddings(
            model_name=settings.embedding_model_name,
        )

    if provider == "hf_api":
        return HuggingFaceInferenceAPIEmbeddings(
            model_name=settings.embedding_model_name,
            api_key=settings.hf_token,
        )

    if provider == "openai":
        return OpenAIEmbeddings(
            openai_api_key=settings.scw_api_key,
            openai_api_base=settings.scw_generative_apis_endpoint,
            model=settings.embedding_model_name,
            tiktoken_enabled=False,
        )
    return None
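For reference, a short usage sketch of get_embedder, assuming settings.embedding_model_name points at a 768-dimensional model so vectors match the VectorParams(size=768) used in bdd_manager.py; the query string is made up:

# Illustrative only: pick a backend and embed a query.
from app.internal.embedder import get_embedder

embedder = get_embedder(provider="hf_api")  # or "hf_local" / "openai"
if embedder is None:
    raise ValueError("Unknown embedding provider")

vector = embedder.embed_query("transition écologique")
print(len(vector))  # expected to match the collection size (768)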
backend/app/internal/export_report.py
ADDED
@@ -0,0 +1,95 @@
import logging
import re
from typing import List
from fpdf import FPDF
from datetime import datetime

# Logger initialization
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def create_pdf_report(output_path: str, logo_path: str, report_text: str):
    """
    Creates a PDF report with a logo, the current date, and a given text.

    Args:
        output_path (str): The path where the generated PDF will be saved.
        logo_path (str): The path to the logo image to include in the report.
        report_text (str): The text content to include in the report.

    Returns
    -------
    None: The function saves the PDF to the specified output path.

    Raises
    ------
    FileNotFoundError: If the logo file does not exist.
    ValueError: If the provided paths or text are invalid.
    """
    pdf = FPDF()
    pdf.add_page()

    # Set font for the document
    pdf.set_font("Arial", size=12)

    # Add logo
    try:
        pdf.image(logo_path, x=10, y=8, w=30)
    except FileNotFoundError:
        raise FileNotFoundError(f"Logo file not found at: {logo_path}")  # noqa: B904

    # Add title
    pdf.set_font("Arial", style="B", size=16)
    pdf.cell(200, 10, txt="Rapport de conversation avec Dis-ADEME", ln=True, align="C")

    # Add date
    pdf.set_font("Arial", size=12)
    creation_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    pdf.ln(10)  # Add some space
    pdf.cell(
        200,
        10,
        txt=f"Date de création : {creation_date}",
        ln=True,
        align="R",
    )

    # Add content
    pdf.ln(20)  # Add some space
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, txt=report_text)

    # Save the PDF
    try:
        pdf.output(output_path)
        logger.info(f"PDF report created successfully at: {output_path}")
    except Exception as e:  # noqa: BLE001
        raise ValueError(f"Failed to save PDF: {e}")  # noqa: B904


def extract_pdf_references(conversation: List[dict]) -> List[str]:
    """
    Extract unique PDF references from the chatbot's responses in the conversation.

    Args:
        conversation (List[dict]): List of dictionaries representing the conversation.
            Each dictionary contains 'role' ('user' or 'assistant')
            and 'content' (message string).

    Returns:
        List[str]: A list of unique PDF references mentioned in the chatbot's responses.
    """
    pdf_references = set()

    for message in conversation:
        if (
            message.get("role") == "assistant"
            and "Consultez les documents suivants pour plus d'information:"
            in message.get("content", "")
        ):
            # Extract all PDF file names using regex
            matches = re.findall(r"[\w\s-]+\.pdf", message["content"])
            pdf_references.update(matches)
    return sorted(pdf_references)
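A small usage sketch of the two helpers above; the conversation content, output path, and logo path are illustrative placeholders:

# Illustrative usage of the report helpers; paths and messages are placeholders.
from app.internal.export_report import create_pdf_report, extract_pdf_references

conversation = [
    {"role": "user", "content": "Quelles aides pour la rénovation ?"},
    {
        "role": "assistant",
        "content": "Consultez les documents suivants pour plus d'information: guide-renovation.pdf",
    },
]

# Collects *.pdf names from assistant messages that carry the reference marker.
refs = extract_pdf_references(conversation)
print(refs)

create_pdf_report(
    output_path="export.pdf",
    logo_path="backend/app/resources/logo_ademe.png",
    report_text="Résumé de la conversation...",
)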
backend/app/internal/llm_chat.py
ADDED
@@ -0,0 +1,346 @@
import uuid
from typing import Any, Callable, Dict, List

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.base import Chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

from app.internal.export_report import extract_pdf_references
from app.internal.template_prompt import summary_system_prompt
from app.settings import settings


def get_chat_llm() -> BaseChatModel:
    """
    Initializes and returns a chat model configured with the provided settings.

    Returns:
        BaseChatModel: A chat model configured to use the specified model, API endpoint, and API key.

    Raises:
        ValueError: If any of the required settings (endpoint, API key, or model name) is missing.
    """
    try:
        if settings.provider == "hf_local":
            pass

        if settings.provider == "hf_api":
            if not settings.hf_token:
                raise ValueError("The Hugging Face API token is not set.")

            llm = HuggingFaceEndpoint(
                repo_id=settings.llm_model_name,
                task="text-generation",
                max_new_tokens=settings.max_length,
                do_sample=False,
                repetition_penalty=1.03,
                temperature=settings.temperature,
                # huggingfacehub_api_token=settings.hf_token,
            )

            return ChatHuggingFace(llm=llm)

        if settings.provider == "openai":
            if not settings.scw_generative_apis_endpoint:
                raise ValueError("The SCW Generative APIs endpoint is not set.")
            if not settings.scw_api_key:
                raise ValueError("The SCW API key is not set.")
            if not settings.llm_model_name:
                raise ValueError("The LLM model name is not set.")

            return ChatOpenAI(
                base_url=settings.scw_generative_apis_endpoint,
                api_key=settings.scw_api_key,
                model=settings.llm_model_name,
                temperature=settings.temperature,
            )
    except Exception as e:
        raise RuntimeError(f"Failed to initialize the chat model: {e}")


def get_history_retriever(llm, retriever, contextualize_q_prompt) -> object:
    """
    Creates a history-aware retriever using the provided LLM, retriever, and contextualization prompt.

    Args:
        llm: The language model used for generating context-aware queries.
        retriever: The retriever instance for querying a vector store or similar.
        contextualize_q_prompt: A prompt template for contextualizing queries.

    Returns:
        object: A history-aware retriever instance.

    Raises:
        ValueError: If any of the required inputs are None or invalid.
    """
    if not llm or not retriever or not contextualize_q_prompt:
        raise ValueError(
            "LLM, retriever, and contextualize_q_prompt must all be provided."
        )

    try:
        return create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
    except Exception as e:
        raise RuntimeError(f"Failed to create history-aware retriever: {e}")


def get_system_prompt_chain(llm, qa_prompt) -> object:
    """
    Creates a prompt chain for processing system-level instructions with a question-answering prompt.

    Args:
        llm: The language model used for processing the system prompt.
        qa_prompt: The prompt template for question-answering tasks.

    Returns:
        object: A chain instance for system prompt processing.

    Raises:
        ValueError: If either `llm` or `qa_prompt` is None.
    """
    if not llm or not qa_prompt:
        raise ValueError("LLM and qa_prompt must both be provided.")

    try:
        return create_stuff_documents_chain(llm, qa_prompt)
    except Exception as e:
        raise RuntimeError(f"Failed to create system prompt chain: {e}")


def get_rag_chain(history_aware_retriever, question_answer_chain) -> object:
    """
    Creates a Retrieval-Augmented Generation (RAG) chain using a history-aware retriever and a Q&A chain.

    Args:
        history_aware_retriever: A retriever configured to incorporate conversation history into queries.
        question_answer_chain: A chain for handling question-answering tasks.

    Returns:
        object: A RAG chain instance.

    Raises:
        ValueError: If either `history_aware_retriever` or `question_answer_chain` is None.
    """
    if not history_aware_retriever or not question_answer_chain:
        raise ValueError(
            "Both history_aware_retriever and question_answer_chain must be provided."
        )

    try:
        return create_retrieval_chain(history_aware_retriever, question_answer_chain)
    except Exception as e:
        raise RuntimeError(f"Failed to create RAG chain: {e}")


def get_session_history(session_id: str, history_store: dict) -> BaseChatMessageHistory:
    """
    Retrieves or initializes the chat history for a given session ID.

    Args:
        session_id (str): The unique identifier for the session.
        history_store (dict): A dictionary to store session histories.

    Returns:
        BaseChatMessageHistory: The chat message history for the session.

    Raises:
        ValueError: If `session_id` is not provided.
    """
    if not session_id:
        raise ValueError("A valid session_id must be provided.")

    if session_id not in history_store:
        history_store[session_id] = ChatMessageHistory()

    return history_store[session_id]


def get_conversational_rag_chain(
    rag_chain: Chain,
    session_history_func: Callable[[str], BaseChatMessageHistory],
) -> RunnableWithMessageHistory:
    """
    Creates a conversational Retrieval-Augmented Generation (RAG) chain with session history.

    Args:
        rag_chain (Chain): The RAG chain for handling retrieval and generation tasks.
        session_history_func (Callable): A function to retrieve or initialize session history.

    Returns:
        RunnableWithMessageHistory: A chain that maintains message history and processes input/output.

    Raises:
        ValueError: If `rag_chain` or `session_history_func` is not provided.
    """
    if not rag_chain:
        raise ValueError("A valid rag_chain must be provided.")
    if not session_history_func:
        raise ValueError("A valid session history function must be provided.")

    return RunnableWithMessageHistory(
        rag_chain,
        session_history_func,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )


def question_to_conversational_rag_chain(
    user_query: str, conversational_rag_chain: Any, session_id: str = None
) -> Dict[str, Any]:
    """
    Sends a user query to a conversational RAG chain and retrieves the response.

    Args:
        user_query (str): The query from the user.
        conversational_rag_chain (Any): The conversational RAG chain instance.
        session_id (str, optional): A unique identifier for the session. If not provided, a new session_id is generated.

    Returns:
        Dict[str, Any]: The response from the conversational RAG chain.

    Raises:
        ValueError: If the user query is empty or the RAG chain is not provided.
        RuntimeError: If an error occurs during the invocation of the RAG chain.
    """
    if not user_query:
        raise ValueError("The user query must be a non-empty string.")
    if not conversational_rag_chain:
        raise ValueError("A valid conversational RAG chain must be provided.")

    # Generate a session_id if none is provided
    if not session_id:
        session_id = str(uuid.uuid4())

    try:
        # Invoke the conversational RAG chain
        return conversational_rag_chain.invoke(
            {"input": user_query}, config={"configurable": {"session_id": session_id}}
        )
    except Exception as e:
        raise RuntimeError(f"Failed to process the query with the RAG chain: {e}")


def get_documents_retrieve(output: Dict[str, Any], max_docs: int = 3) -> List[str]:
    """
    Retrieves the titles of the documents from the output context.

    Args:
        output (Dict[str, Any]): The output containing context and metadata.
        max_docs (int): The maximum number of document titles to retrieve. Default is 3.

    Returns:
        List[str]: A list of document titles, or None when the 'context' key is missing.
    """
    if "context" not in output:
        return None

    return [
        output["context"][i].metadata.get("Title", "Untitled Document")
        for i in range(min(len(output["context"]), max_docs))
    ]  # TODO: add a filter on the type of documents to return


def get_llm_answer(output: Dict[str, Any]) -> str:
    """
    Extracts the answer generated by the LLM from the output.

    Args:
        output (Dict[str, Any]): The output containing the answer.

    Returns:
        str: The LLM-generated answer.

    Raises:
        ValueError: If the 'answer' key is missing or empty in the output.
    """
    if "answer" not in output or not output["answer"]:
        raise ValueError("The output does not contain a valid 'answer'.")

    return output["answer"]


def get_format_output(answer: str, context: List[str]) -> str:
    """
    Formats the LLM answer with a list of related document titles.

    Args:
        answer (str): The LLM-generated answer.
        context (List[str]): A list of document titles related to the answer.

    Returns:
        str: A formatted string containing the answer and document references.

    Raises:
        ValueError: If the answer is empty or None.
    """
    if not answer:
        raise ValueError("The 'answer' must be a non-empty string.")

    formatted_output = f"{answer}"
    if context:
        uniques_doc = set(context)
        formatted_output += (
            "\n\nConsultez les documents suivants pour plus d'information:\n\n"
        )
        formatted_output += "\n\n".join(uniques_doc)

    return formatted_output


def clean_output(answer):  # TODO add clean process for output
    pass


def generate_summary(llm, conversation: List[dict]) -> str:
    """
    Generate a summary of the conversation with LangChain and append PDF references at the end.

    Args:
        conversation (List[dict]): List of dictionaries representing the conversation.
            Each dictionary contains 'role' ('user' or 'assistant')
            and 'content' (message string).
        llm: The chat model used to produce the summary.

    Returns:
        str: The generated summary with PDF references appended.
    """
    # Extract unique PDF references
    pdf_references = extract_pdf_references(conversation)

    # Prepare the messages; copy the shared system prompt so repeated calls do not mutate it
    messages = list(summary_system_prompt)

    for message in conversation:
        if message["role"] == "user":
            messages.append(HumanMessage(content=message["content"]))

        elif message["role"] == "assistant":
            messages.append(AIMessage(content=message["content"]))

    # Generate the summary
    summary_prompt = ChatPromptTemplate.from_messages(messages).format()

    summary = llm.invoke(summary_prompt)

    # Append the PDF references
    summary_text = summary.content
    if pdf_references:
        summary_text += (
            "\n\nDocuments pdf à consulter pour plus d'information:"
            + "\n".join(sorted(pdf_references))
        )

    return summary_text
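Taken together, these helpers assemble into a single conversational chain. A condensed wiring sketch, assuming the Qdrant collection "demo_docs" already exists and the configured provider credentials are valid (the collection name and query are illustrative):

# Illustrative end-to-end wiring of the helpers above; names are placeholders.
from app.internal.bdd_manager import get_retriever, get_vector_store
from app.internal.embedder import get_embedder
from app.internal.llm_chat import (
    get_chat_llm,
    get_conversational_rag_chain,
    get_history_retriever,
    get_llm_answer,
    get_rag_chain,
    get_session_history,
    get_system_prompt_chain,
    question_to_conversational_rag_chain,
)
from app.internal.template_prompt import contextualize_q_prompt, qa_prompt

history_store: dict = {}

embedder = get_embedder(provider="openai")
retriever = get_retriever(get_vector_store(embedder, "demo_docs"))

llm = get_chat_llm()
rag_chain = get_rag_chain(
    get_history_retriever(llm, retriever, contextualize_q_prompt),
    get_system_prompt_chain(llm, qa_prompt),
)
chain = get_conversational_rag_chain(
    rag_chain, lambda sid: get_session_history(sid, history_store)
)

# Each call threads its session history through the chain automatically.
output = question_to_conversational_rag_chain("Qu'est-ce que l'ADEME ?", chain)
print(get_llm_answer(output))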
backend/app/internal/parser.py
ADDED
@@ -0,0 +1,104 @@
import os
from typing import Iterator, List, Union

import openparse
from docling.document_converter import DocumentConverter
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter

from app.settings import Settings


def get_pdf_paths(directory_or_file: Union[str, os.PathLike]) -> List[str]:
    """
    Retrieve all PDF file paths from a given directory, including its subdirectories, or from a single file.

    Args:
        directory_or_file (Union[str, os.PathLike]): Path to a directory or a single file.

    Returns:
        List[str]: A list of file paths to PDF files.

    Raises:
        FileNotFoundError: If the given path does not exist.
        ValueError: If the input path is neither a directory nor a PDF file.
    """
    if not os.path.exists(directory_or_file):
        raise FileNotFoundError(f"The path '{directory_or_file}' does not exist.")

    pdf_paths = []

    if os.path.isdir(directory_or_file):
        for root, _, files in os.walk(directory_or_file):
            for file in files:
                if file.lower().endswith(".pdf"):
                    pdf_paths.append(os.path.join(root, file))

    elif os.path.isfile(directory_or_file):
        if directory_or_file.lower().endswith(".pdf"):
            pdf_paths.append(directory_or_file)
        else:
            raise ValueError(f"The file '{directory_or_file}' is not a PDF.")
    else:
        raise ValueError(
            f"The path '{directory_or_file}' is neither a directory nor a valid file."
        )

    return pdf_paths


settings = Settings()


def parse_document(doc_path, parser=settings.parser):
    parsed_doc = None

    if parser == "openparse":
        # Use a distinct name so the `parser` argument is not shadowed.
        doc_parser = openparse.DocumentParser()
        parsed_basic_doc = doc_parser.parse(doc_path)

        parsed_doc = [
            node.text.replace("<br><br>", "\n") for node in parsed_basic_doc.nodes
        ]

    if parser == "docling":  # FIXME
        converter = DocumentConverter()
        parsed_doc = converter.convert(doc_path)

        # loader = DoclingPDFLoader(file_path=doc_path)
        # parsed_doc = loader.load()

    return parsed_doc


def split_documents(text_splitter, docs):
    return text_splitter.split_documents(docs)


def get_text_chunker():
    return RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)


# class DoclingPDFLoader(BaseLoader):

#     def __init__(self, file_path: str | list[str]) -> None:
#         self._file_paths = file_path if isinstance(
#             file_path, list) else [file_path]
#         self._converter = DocumentConverter()

#     def lazy_load(self) -> Iterator[LCDocument]:
#         for source in self._file_paths:
#             dl_doc = self._converter.convert(source).document
#             text = dl_doc.export_to_markdown()
#             yield LCDocument(page_content=text)


# loader = DoclingPDFLoader(file_path=path)
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=200,
# )

# docs = loader.load()
# splits = text_splitter.split_documents(docs)

# splits
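A brief sketch of the ingestion path these helpers support, assuming the "openparse" parser (which returns a list of text blocks); the directory "data/pdfs" is an illustrative placeholder:

# Illustrative: parse PDFs into text blocks, then chunk them for embedding.
from app.internal.parser import get_pdf_paths, get_text_chunker, parse_document

paths = get_pdf_paths("data/pdfs")  # walks the directory tree for *.pdf
chunker = get_text_chunker()        # 1000-char chunks with 200-char overlap

for path in paths:
    texts = parse_document(path, parser="openparse")
    documents = chunker.create_documents(texts)
    print(path, len(documents))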
backend/app/internal/template_prompt.py
ADDED
@@ -0,0 +1,79 @@
from langchain_core.messages import SystemMessage  # noqa: D100
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

### Contextualize question ###
contextualize_q_system_prompt = """Based on the provided chat history and the most
recent user question, your task is to reformulate the latest question
into a fully standalone version.

Ensure the reformulated question is clear, self-contained, and does not rely
on any prior context from the chat history to be understood.
If the latest question already functions as a standalone question,
return it unchanged.
Do NOT provide an answer to the question or interpret the user’s intent
beyond making the question self-contained.
Retain all technical details, key terms, and precision from the original
question in your reformulation.
Your sole output should be the reformulated standalone question,
or the original question if no reformulation is required."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


### Answer question ###
system_prompt = """You are an intelligent and professional assistant named 'Dis-ADEME',
created by the ADEME organization to assist with question-answering tasks related
to ecological transition, sustainable practices, and technical inquiries.

Use the provided retrieved context to answer the user's question accurately
and concisely.
If the retrieved context does not contain the necessary information,
explicitly state:
"Je suis désolé, je ne dispose pas des informations nécessaires
pour répondre à cette question."
Limit your response to a maximum of three sentences while maintaining clarity
and relevance. Ensure that your tone is formal and professional,
as your responses are intended for official use.
Do not speculate or provide information that is not explicitly supported
by the retrieved context.
Context:
{context}"""

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

### Conversation summary ###
summary_report_system_prompt = """
You are a knowledgeable and professional French assistant named 'Dis-ADEME',
created by the ADEME organization.
Your task is to summarize in French the following conversation between a user and
an assistant, providing a structured, comprehensive, and detailed summary.

Focus exclusively on the content and technical details discussed in the conversation,
omitting any reference to the roles of the participants
(e.g., "user" or "assistant").
Present the information in clear, concise, and professional language,
suitable for inclusion in an official administrative report.
Emphasize critical technical details, key points of discussion,
and any actionable insights or conclusions derived from the conversation.
Organize the summary into sections or paragraphs if appropriate,
ensuring clarity and logical flow.
If the conversation references external documents or resources (e.g., PDFs),
include their titles or descriptions in a dedicated section at the end of the summary.
Do not include any conversational or informal elements; maintain
a formal and neutral tone throughout.
Output your response as a structured report in French, ready for official use.
"""

summary_system_prompt = [SystemMessage(content=summary_report_system_prompt)]
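For illustration, how the contextualization prompt renders against a tiny chat history (the message contents are made up):

# Illustrative rendering of contextualize_q_prompt with a two-turn history.
from langchain_core.messages import AIMessage, HumanMessage
from app.internal.template_prompt import contextualize_q_prompt

messages = contextualize_q_prompt.format_messages(
    chat_history=[
        HumanMessage(content="Qu'est-ce que l'ADEME ?"),
        AIMessage(content="L'ADEME est l'agence de la transition écologique."),
    ],
    input="Quelles aides propose-t-elle ?",
)
for m in messages:
    print(type(m).__name__, ":", m.content[:60])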
backend/app/main.py
ADDED
@@ -0,0 +1,32 @@
"""Main module."""

import logging
from typing import Any
import uvicorn
from fastapi import FastAPI

from app.routers.chatting import chat_router
from app.routers.embedding import embedding_router


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

logger = logging.getLogger(__name__)

app = FastAPI()

app.include_router(embedding_router)
app.include_router(chat_router)


@app.get("/")
async def root() -> Any:  # noqa: ANN401
    """Return greetings."""
    return {"message": "Hello ADEME!"}


if __name__ == "__main__":
    uvicorn.run(app, log_level="info")
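A quick smoke test of the root route with FastAPI's test client; note that importing app.main pulls in the routers, which build their Qdrant and embedding resources at import time, so the backing services are assumed to be reachable:

# Minimal smoke test for the root route.
from fastapi.testclient import TestClient
from app.main import app

client = TestClient(app)
response = client.get("/")
assert response.status_code == 200
assert response.json() == {"message": "Hello ADEME!"}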
backend/app/resources/logo_ademe.png
ADDED
backend/app/routers/__init__.py
ADDED
File without changes
backend/app/routers/chatting.py
ADDED
@@ -0,0 +1,215 @@
import logging
from typing import Any, Dict, List

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from app.internal.bdd_manager import (
    create_collection,
    get_ensemble_retriever,
    get_retriever,
    get_vector_store,
)
from app.internal.embedder import get_embedder
from app.internal.export_report import create_pdf_report
from app.internal.llm_chat import (
    generate_summary,
    get_chat_llm,
    get_conversational_rag_chain,
    get_documents_retrieve,
    get_format_output,
    get_history_retriever,
    get_llm_answer,
    get_rag_chain,
    get_session_history,
    get_system_prompt_chain,
    question_to_conversational_rag_chain,
)
from app.internal.template_prompt import contextualize_q_prompt, qa_prompt
from app.settings import settings

# Logger initialization
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

chat_router = APIRouter(
    prefix="/chatting",
    tags=["question_answer"],
    responses={404: {"description": "Not found"}},
)


class QueryRequest(BaseModel):
    user_query: str
    session_id: str = settings.session_id


class ResponseOutput(BaseModel):
    answer: str
    context: List[str]
    formatted_output: str


class Conversation(BaseModel):
    messages: List[Any]


class ResponseOutputSum(BaseModel):
    summary: str


# Resource initialization
user_collection_name = settings.user_collection_name
logger.info("Initializing collection: %s", user_collection_name)
create_collection(user_collection_name)

doc_collection_name = settings.doc_collection_name
logger.info("Initializing collection: %s", doc_collection_name)
create_collection(doc_collection_name)

embedder = get_embedder(provider=settings.provider)
logger.info("Embedder initialized.")

doc_vector_store = get_vector_store(embedder, doc_collection_name)
logger.info("Vector store initialized with collection: %s", doc_collection_name)

user_vector_store = get_vector_store(embedder, user_collection_name)
logger.info("Vector store initialized with collection: %s", user_collection_name)

logger.info("Initializing LLM and retrievers...")
llm = get_chat_llm()
user_retriever = get_retriever(user_vector_store)
doc_retriever = get_retriever(doc_vector_store)
retriever = get_ensemble_retriever(doc_retriever, user_retriever)

logger.info("Creating history-aware retriever...")
history_retriever = get_history_retriever(llm, retriever, contextualize_q_prompt)

logger.info("Creating system prompt chain...")
qa_chain = get_system_prompt_chain(llm, qa_prompt)

logger.info("Creating RAG chain...")
rag_chain = get_rag_chain(history_retriever, qa_chain)

logger.info("Initializing conversational RAG chain...")
conversational_chain = get_conversational_rag_chain(
    rag_chain,
    # Use the per-request session id rather than the static one from settings.
    lambda sid: get_session_history(sid, settings.history_store),
)


@chat_router.post("/chat", response_model=ResponseOutput)
async def chat_with_rag_chain(request: QueryRequest):
    """
    Route for interacting with the RAG (Retrieval-Augmented Generation) chain.
    """
    logger.info("Received chat request with session_id: %s", request.session_id)
    logger.info("User query: %s", request.user_query)

    try:
        logger.info("Processing user query...")
        response = question_to_conversational_rag_chain(
            request.user_query, conversational_chain, request.session_id
        )
        logger.info("LLM response received: %s", response)

        answer = get_llm_answer(response)
        documents = get_documents_retrieve(response)

        logger.info("Formatting output...")
        formatted_output = get_format_output(answer, documents)

        logger.info(
            "Successfully processed chat request for session_id: %s", request.session_id
        )
        return {
            "answer": answer,
            "context": documents,
            "formatted_output": formatted_output,
        }

    except ValueError as e:
        logger.error("Validation error: %s", str(e))
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error("Internal server error: %s", str(e))
        raise HTTPException(status_code=500, detail=f"Internal server error: {e}")


@chat_router.get("/history/{session_id}")
async def get_chat_history(session_id: str):
    """
    Route for retrieving the message history of a given session.
    """
    logger.info("Fetching chat history for session_id: %s", session_id)

    try:
        history = get_session_history(session_id, settings.history_store)
        logger.info(
            "Successfully retrieved chat history for session_id: %s", session_id
        )
        return {"session_id": session_id, "history": history.messages}
    except ValueError as e:
        logger.error("Validation error: %s", str(e))
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error("Internal server error while fetching history: %s", str(e))
        raise HTTPException(status_code=500, detail=f"Internal server error: {e}")


@chat_router.post("/summary", response_model=ResponseOutputSum)
async def summarize_conversation(conversation: Conversation):
    """
    Generates a summary of the conversation and lists the referenced PDF documents.

    Args:
        conversation (Conversation): Object holding the conversation messages.

    Returns:
        dict: The conversation summary and the list of referenced PDF documents.
    """
    output_path = r"..\Shared_data\export.pdf"
    # output_path = r"C:\Users\jeanb\Documents\kzs-team\Shared_data\export.pdf"
    logo_path = r"app\resources\logo_ademe.png"
    summary_text = generate_summary(llm, conversation.messages)

    create_pdf_report(output_path, logo_path, summary_text)

    return {"summary": summary_text}
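For reference, calling the chat route over HTTP; the host and port are illustrative (the backend is assumed to be reachable at localhost:8000) and the session id is a placeholder:

# Illustrative client call to the /chatting/chat endpoint.
import httpx

payload = {
    "user_query": "Quelles sont les aides de l'ADEME ?",
    "session_id": "demo-session",
}
response = httpx.post(
    "http://localhost:8000/chatting/chat", json=payload, timeout=60.0
)
response.raise_for_status()

body = response.json()  # matches the ResponseOutput model
print(body["answer"])
print(body["formatted_output"])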
backend/app/routers/embedding.py
ADDED
@@ -0,0 +1,169 @@
"""Embedding tools."""

import logging
import os
from typing import List
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from app.internal.bdd_manager import create_collection, get_vector_store
from app.internal.embedder import get_embedder
from app.internal.parser import get_pdf_paths, get_text_chunker, parse_document
from app.settings import settings


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

embedding_router = APIRouter(
    prefix="/embeddings",
    tags=["documents"],
    responses={404: {"description": "Not found"}},
)

user_collection_name = settings.user_collection_name
logger.info("Initializing collection: %s", user_collection_name)
create_collection(user_collection_name)

doc_collection_name = settings.doc_collection_name
logger.info("Initializing collection: %s", doc_collection_name)
create_collection(doc_collection_name)

embedder = get_embedder(provider=settings.provider)
logger.info("Embedder initialized.")

doc_vector_store = get_vector_store(embedder, doc_collection_name)
logger.info("Vector store initialized with collection: %s", doc_collection_name)

user_vector_store = get_vector_store(embedder, user_collection_name)
logger.info("Vector store initialized with collection: %s", user_collection_name)

text_splitter = get_text_chunker()
logger.info("Text splitter initialized.")


def get_vectorstore(vectorstor_type):
    if vectorstor_type == "user":
        return user_vector_store

    if vectorstor_type == "doc":
        return doc_vector_store
    return None


class DocPathsInput(BaseModel):  # TODO move to schema.py
    doc_paths: str
    vectorstor_type: str


@embedding_router.post("/embedded/")
async def embedding(doc_paths_input: DocPathsInput):
    """
    Embeds documents provided via file paths and adds them to the vector store.

    Args:
        doc_paths_input (DocPathsInput): A Pydantic model containing
            a list of document file paths.

    Returns:
        dict: A response containing the number of documents added to the vector store.

    Raises:
        HTTPException: If the document parsing or embedding process fails.
    """
    logger.info("Received request to embed documents: %s", doc_paths_input.doc_paths)
    vector_store = get_vectorstore(doc_paths_input.vectorstor_type)

    try:
        folder_path = doc_paths_input.doc_paths
        logger.info(folder_path)
        doc_paths = get_pdf_paths(folder_path)
        logger.info(doc_paths)
        for path in doc_paths:
            try:
                logger.info("Parsing document at path: %s", path)
                parsed_documents = parse_document(path)
                # os.path.basename works on any OS, unlike splitting on "\\".
                doc_title = os.path.basename(path)
                logger.info("Document parsed: %s", doc_title)

                documents = text_splitter.create_documents(
                    parsed_documents,
                    metadatas=[
                        {"Title": doc_title} for _ in range(len(parsed_documents))
                    ],
                )
                logger.info(
                    "Created %d document chunks for: %s", len(documents), doc_title
                )

                vector_store.add_documents(documents)

                logger.info("Documents added to vector store: %s", doc_title)

            except Exception as e:
                logger.info(
                    f"An error occurred during the parsing of the file {path}: {e}"
                )

        logger.info("All documents successfully processed and embedded.")
        return {
            "message": "Documents successfully embedded and stored",
            "documents_added": len(doc_paths),
        }

    except Exception as e:
        logger.error("An error occurred during the embedding process: %s", e)
        raise HTTPException(status_code=500, detail=f"An error occurred: {e!s}")


class SearchQuery(BaseModel):  # TODO move to schema.py
    vectorstor_type: str
    query: str
    k: int = 2


@embedding_router.post("/similarity_search/")
async def search_documents(search_query: SearchQuery):
    """
    Search for documents in the vector store based on a query.

    Args:
        search_query (SearchQuery): A Pydantic model containing the query string and the number of results (k).

    Returns:
        List[dict]: A list of documents matching the query, including their content and metadata.

    Raises:
        HTTPException: If the search process fails or no documents are found.
    """
    logger.info("Received similarity search query: %s", search_query.query)

    vector_store = get_vectorstore(search_query.vectorstor_type)

    try:
        found_docs = vector_store.similarity_search(
            search_query.query, k=search_query.k
        )
        logger.info(
            "Found %d documents for query: %s", len(found_docs), search_query.query
        )

        if not found_docs:
            logger.warning("No documents found for query: %s", search_query.query)
            raise HTTPException(
                status_code=404, detail="No documents found for the given query."
            )

        logger.info("Returning results for query: %s", search_query.query)
        return [
            {
                "content": doc.page_content,
|
161 |
+
"metadata": doc.metadata if hasattr(doc, "metadata") else None,
|
162 |
+
}
|
163 |
+
for doc in found_docs
|
164 |
+
]
|
165 |
+
except Exception as e:
|
166 |
+
logger.error("An error occurred during the similarity search: %s", e)
|
167 |
+
raise HTTPException(
|
168 |
+
status_code=500, detail=f"An error occurred during the search: {e}"
|
169 |
+
)
|
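A minimal sketch of driving both routes, assuming the backend is reachable directly on port 8000 (the payload fields match the Pydantic models above; the folder path is hypothetical):

import requests

BASE = "http://localhost:8000"  # assumed direct backend address

# Index every PDF found under a folder into the "doc" vector store.
resp = requests.post(
    f"{BASE}/embeddings/embedded/",
    json={"doc_paths": "/data/pdfs", "vectorstor_type": "doc"},  # hypothetical folder
    timeout=300,
)
resp.raise_for_status()
print(resp.json())  # {"message": ..., "documents_added": N}

# Retrieve the 2 most similar chunks for a query.
resp = requests.post(
    f"{BASE}/embeddings/similarity_search/",
    json={"vectorstor_type": "doc", "query": "aides financières", "k": 2},
)
resp.raise_for_status()
for hit in resp.json():
    print(hit["metadata"], hit["content"][:80])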
backend/app/settings.py
ADDED
@@ -0,0 +1,30 @@
+import os
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
+    llm_model_name: str = "HuggingFaceH4/zephyr-7b-beta"
+    context_window_size: int = 5
+    retrieval_top_k: int = 3
+    temperature: float = 0.2
+    max_length: int = 2048
+    hf_token: str = os.getenv("HF_TOKEN", "")
+
+    if not hf_token:
+        raise ValueError(
+            "ERREUR : Le token Hugging Face (HF_TOKEN) n'est pas défini ! Ajoute-le dans les variables d'environnement Hugging Face Spaces."
+        )
+
+    embedding_model_name: str = "sentence-transformers/sentence-t5-xxl"
+    # qdrant_url: str = "http://qdrant:6333"
+    qdrant_url: str = "http://localhost:6333"
+    parser: str = "openparse"
+    history_store: dict = {}
+    session_id: str = "user012025"
+    user_collection_name: str = "User_Ademe_collection"
+    doc_collection_name: str = "Doc_Ademe_collection"
+    provider: str = "hf_api"
+
+
+settings = Settings()
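Since Settings derives from BaseSettings, each field can be overridden by an environment variable (or by the .env file named in model_config) before falling back to the defaults above. A small sketch, assuming the variables are set before the module is imported (the token value is a placeholder):

import os

# HF_TOKEN must be present before import, otherwise the class-body check
# in app.settings raises ValueError at import time.
os.environ["HF_TOKEN"] = "hf_xxx"                 # placeholder token
os.environ["QDRANT_URL"] = "http://qdrant:6333"   # switch to the Docker hostname

from app.settings import settings

print(settings.qdrant_url)      # -> http://qdrant:6333
print(settings.llm_model_name)  # -> HuggingFaceH4/zephyr-7b-beta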
backend/test/test_main.py
ADDED
@@ -0,0 +1,6 @@
+"""Basic test."""
+
+
+def test_basic() -> None:
+    """Test that 1 + 1 equals 2."""
+    assert 1 + 1 == 2
dockerignore
ADDED
@@ -0,0 +1,17 @@
+dist/
+env/
+presentation/
+__pycache__/
+.coverage/
+.doit.db
+.git/
+.gitignore
+.idea/
+.mypy_cache/
+.pytest_cache/
+.ruff_cache/
+.venv/
+.gitlab-ci.yml
+renovate.json
+Dockerfile
+dodo.py
frontend/app/__init__.py
ADDED
File without changes
frontend/app/main.py
ADDED
@@ -0,0 +1,200 @@
+import os
+import base64
+from pathlib import Path
+import streamlit as st
+import requests
+from settings import settings
+
+
+BASE_DIR = str(Path(__file__).resolve().parent)
+# API_URL_CHAT = "http://localhost:8088/chatting/chat"
+# API_URL_EMBEDDING = "http://localhost:8088/embeddings/embedded"
+# API_URL_SUM = "http://localhost:8088/chatting/summary"
+API_URL_CHAT = "http://localhost/api/chatting/chat"
+API_URL_EMBEDDING = "http://localhost/api/embeddings/embedded"
+API_URL_SUM = "http://localhost/api/chatting/summary"
+
+st.set_page_config(
+    page_title="CV_JBDENIS",
+    page_icon="🧊",
+)
+
+# Helper functions for the page background
+
+
+def get_base64_of_bin_file(bin_file):  # noqa: ANN001, ANN201, D103
+    with open(bin_file, "rb") as f:
+        data = f.read()
+    return base64.b64encode(data).decode()
+
+
+def set_png_as_page_bg(png_file) -> None:  # noqa: ANN001, D103
+    bin_str = get_base64_of_bin_file(png_file)
+    page_bg_img = (
+        """
+    <style>
+    .stApp {
+    background-image: url("data:image/png;base64,%s");
+    background-size: cover;
+    }
+    </style>
+    """  # noqa: UP031
+        % bin_str
+    )
+    st.markdown(page_bg_img, unsafe_allow_html=True)
+    return  # noqa: PLR1711
+
+
+# Set the background
+set_png_as_page_bg(png_file=r"app\resources\aide-financiere-ademe.JPG")
+
+logo_path = r"app\resources\logo_ademe.png"
+
+col1, col2 = st.columns([3, 2])
+with col1:
+    st.image(logo_path, width=400)
+with col2:
+    st.title("Dis-ADEME")
+    st.write("Bienvenue dans votre application de chat.")
+
+# Navigation
+st.sidebar.title("Menu")
+page = st.sidebar.radio("Navigation", ["Accueil", "Admin"])
+
+
+def save_uploaded_files(uploaded_files: list):  # noqa: ANN201, D103
+    save_dir = BASE_DIR + r"\uploaded_files\user"
+    # save_dir = r"\Shared_data\uploaded_files"
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+
+    saved_file_paths = []
+    for uploaded_file in uploaded_files:
+        file_path = os.path.join(save_dir, uploaded_file.name)
+        with open(file_path, "wb") as f:
+            f.write(uploaded_file.getbuffer())
+        saved_file_paths.append(file_path)
+        st.session_state.uploaded_files.append(file_path)
+
+    return saved_file_paths
+
+
+# Home page
+if page == "Accueil":
+    if "uploaded_files" not in st.session_state:
+        st.session_state.uploaded_files = []
+
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+
+    saved_paths = []
+    with st.sidebar:
+        st.header("Uploader des fichiers PDF")
+        uploaded_files = st.file_uploader(
+            "Choisissez des fichiers PDF",
+            type="pdf",
+            accept_multiple_files=True,
+            key="pdf_uploader",
+        )
+
+        if uploaded_files:
+            saved_paths = save_uploaded_files(uploaded_files)
+            st.success(f"Fichiers sauvegardés : {saved_paths[-1]}, en analyse ...")
+
+        if saved_paths:
+            try:
+                response = requests.post(
+                    API_URL_EMBEDDING,
+                    json={"doc_paths": saved_paths[-1], "vectorstor_type": "user"},
+                )
+                response.raise_for_status()
+                embedded = response.json().get(
+                    "message",
+                    "Désolé, une erreur s'est produite durant la lecture du fichier.",
+                )
+
+                if response:
+                    st.success(f"Analyse du fichier {saved_paths[-1]} terminée.")
+
+                saved_paths = []
+            except requests.RequestException as e:
+                embedded = f"Erreur lors de la communication avec l'API : {e}"
+
+        if st.session_state.messages:
+            st.write("")
+            st.divider()
+            st.write("")
+            st.header("Rapport de conversation")
+            if st.button("Générer le rapport de conversation"):
+                try:
+                    response = requests.post(
+                        API_URL_SUM, json={"messages": st.session_state.messages}
+                    )
+                    response.raise_for_status()
+                    summary = response.json().get("summary", "Résumé non disponible.")
+                    st.subheader("Résumé généré")
+                    st.text_area("Rapport", summary, height=200)
+                except requests.exceptions.RequestException as e:
+                    st.error(f"Erreur lors de l'appel de l'API : {e}")
+                if response:
+                    with open(r"..\Shared_data\export.pdf", "rb") as pdf_file:
+                        # with open(r"C:\Users\jeanb\Documents\kzs-team\Shared_data\export.pdf", "rb") as pdf_file:
+
+                        PDFbyte = pdf_file.read()
+
+                    if PDFbyte:
+                        st.download_button(
+                            label="Télécharger le rapport de conversation",
+                            data=PDFbyte,
+                            file_name="Conversation_Dis_ADEME.pdf",
+                            mime="application/octet-stream",
+                        )
+
+    # Chatbot
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"], avatar=message["avatar"]):
+            st.write(message["content"])
+
+    if prompt := st.chat_input("Comment puis-je vous aider ?"):
+        st.session_state.messages.append(
+            {"role": "user", "content": prompt, "avatar": "👤"}
+        )
+        with st.chat_message("user", avatar="👤"):
+            st.write(prompt)
+
+        try:
+            response = requests.post(API_URL_CHAT, json={"user_query": prompt})
+            response.raise_for_status()
+            data = response.json()
+            answer = data.get(
+                "formatted_output", "Désolé, je n'ai pas de réponse à cette question."
+            )
+        except requests.RequestException as e:
+            answer = f"Erreur lors de la communication avec l'API : {e}"
+
+        st.session_state.messages.append(
+            {"role": "assistant", "content": answer, "avatar": "🤖"}
+        )
+        with st.chat_message("assistant", avatar="🤖"):
+            st.write(answer)
+
+# Admin page
+elif page == "Admin":
+    st.title("Admin - Ajouter des documents à la base de données")
+
+    doc_path = st.text_input("Entrez le chemin du document ou du dossier à ajouter")
+
+    if st.button("Ajouter les documents PDF à la base de données"):
+        if doc_path:
+            print("SAVED DOC:", doc_path)
+            try:
+                response = requests.post(
+                    API_URL_EMBEDDING,
+                    json={"doc_paths": doc_path, "vectorstor_type": "doc"},
+                )
+                response.raise_for_status()
+                st.success("Documents ajoutés à la base de données avec succès.")
+            except requests.RequestException as e:
+                st.error(f"Erreur lors de l'ajout des documents : {e}")
+        else:
+            st.warning("Veuillez entrer un chemin valide.")
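One fragility worth flagging: the resource and export paths above are Windows-style raw strings (backslashes), while nginx.conf and supervisord.conf below target a Linux container. A portable variant using pathlib, shown here only as a sketch and not as part of the commit, might look like:

from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent

# Separator-agnostic equivalents of the raw-string paths used above
background = BASE_DIR / "resources" / "aide-financiere-ademe.JPG"
logo_path = BASE_DIR / "resources" / "logo_ademe.png"
save_dir = BASE_DIR / "uploaded_files" / "user"
save_dir.mkdir(parents=True, exist_ok=True)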
frontend/app/resources/aide-financiere-ademe.JPG
ADDED
frontend/app/resources/logo_ademe.png
ADDED
frontend/app/settings.py
ADDED
@@ -0,0 +1,11 @@
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    # model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
+    api_url_chat: str = "http://backend/chatting/chat"
+    api_url_embedding: str = "http://backend/embeddings/embedded"
+    api_url_sum: str = "http://backend/chatting/summary"
+
+
+settings = Settings()
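These service-name URLs (http://backend/...) are currently unused: frontend/app/main.py above hardcodes the localhost /api/ variants instead. Wiring the page to the settings object would look like this short sketch:

import requests
from settings import settings

# Uses the configured URL instead of a hardcoded constant
response = requests.post(settings.api_url_chat, json={"user_query": "Bonjour"})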
frontend/test/test_main.py
ADDED
@@ -0,0 +1,6 @@
+"""Basic test."""
+
+
+def test_basic() -> None:
+    """Test that 1 + 1 equals 2."""
+    assert 1 + 1 == 2
nginx.conf
ADDED
@@ -0,0 +1,36 @@
+user user;
+worker_processes 1;
+
+events {
+    worker_connections 1024;
+}
+
+http {
+    server {
+        listen 80;
+
+        location /api/ {
+            proxy_pass http://127.0.0.1:8000/;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        location / {
+            proxy_pass http://127.0.0.1:8501/;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        location /qdrant/ {
+            proxy_pass http://127.0.0.1:6333/;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+    }
+}
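Because each proxy_pass URL ends with a trailing slash, nginx strips the matched location prefix before forwarding, so /api/chatting/chat reaches the FastAPI app as /chatting/chat. A quick sketch of the mapping, assuming the container publishes port 80:

import requests

# Through the proxy: /api/... reaches the backend on :8000 without the /api prefix
requests.post("http://localhost/api/chatting/chat", json={"user_query": "..."})

# Equivalent direct call from inside the container:
requests.post("http://127.0.0.1:8000/chatting/chat", json={"user_query": "..."})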
pyproject.toml
ADDED
@@ -0,0 +1,144 @@
+[project]
+name = "kzs-rag"
+version = "0.1.0"
+description = "RAG project by kaizen"
+readme = "readme.md"
+requires-python = ">=3.12,<3.13"
+dependencies = [
+    "pydantic-settings>=2.6.1",
+]
+
+[dependency-groups]
+frontend = [
+    "streamlit>=1.40.1",
+]
+
+backend = [
+    "docling>=2.8.1",
+    "fastapi[standard]>=0.115.4",
+    "langchain-community>=0.3.8",
+    "langchain-openai>=0.2.10",
+    "langchain-qdrant>=0.2.0",
+    "langgraph>=0.2.53",
+    "qdrant-client>=1.12.1",
+    "sentence-transformers>=3.3.1",
+    "openparse>=0.7.0",
+    # "fpdf>=1.7.2",
+    "fpdf2>=2.8.1",
+]
+
+dev = [
+    "mypy>=1.13.0",
+    "pytest>=8.3.3",
+    "ruff>=0.7.1",
+    "pytest-forked>=1.6.0",
+    "pytest-gitignore>=1.3",
+    "pytest-html>=4.1.1",
+    "pytest-xdist>=3.6.1",
+    "pandas>=2.2.3",
+    "pandas-stubs>=2.2.3.241009",
+    # "gitlabci-local>=10.2.0",
+    "plotly>=5.24.1",
+    "ipykernel>=6.29.5",
+]
+
+[tool.ruff]
+target-version = "py312"
+fix = false
+line-length = 88  # Same as Black
+exclude = [
+    ".git",
+    ".git-rewrite",
+    ".mypy_cache",
+    ".pytype",
+    ".ruff_cache",
+    "__pypackages__",
+    ".venv"
+]
+
+[tool.ruff.lint]
+fixable = ["ALL"]  # Allow autofix for all enabled rules
+unfixable = []
+# Rule selection
+select = [
+    "F", "E", "C90", "N", "D", "UP", "YTT", "ANN", "ASYNC", "S", "BLE",
+    "FBT", "B", "A", "C4", "DTZ", "T10", "DJ", "EXE", "FA", "ISC",
+    "ICN", "G", "INP", "PIE", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SLOT", "SIM",
+    "TID", "TCH", "INT", "ARG", "PTH", "TD", "FIX", "ERA", "PD", "PGH", "PL", "TRY",
+    "FLY", "NPY", "AIR", "PERF", "RUF", "T20", "I"
+]
+# Not selected:
+# - CPY (flake8-copyright) no need of a copyright per file
+# - COM (flake8-commas) handled by ruff
+# - EM (flake8-errmsg) too little gain for the cost
+
+ignore = [
+    "D203",  # 1 blank line required before class docstring
+    "D212",  # Multi-line docstring summary should start at the first line
+    "TRY003",  # Avoid specifying long messages outside the exception class
+    "ANN101",  # Missing type annotation for self in method
+    "ANN102",  # Missing type annotation for cls in classmethod
+    "G004",  # Logging statement uses f-string
+    "PD013",  # `.melt` is preferred to `.stack`; provides same functionality (WRONG!)
+]
+
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["D104"]  # Ignore "missing docstring in public package" in all `__init__.py` files
+"test/**/*.py" = [  # Ignore rules necessary for tests
+    "INP001",  # Ignore "File is part of an implicit namespace package. Add an `__init__.py`."
+    "S101",  # Ignore "Use of `assert` detected" because pytest relies on assert
+    "N802",  # Ignore "Function name should be lowercase" because test functions are non-standard
+    # "ARG",  # Unused function args -> fixtures nevertheless are functionally relevant...
+    # "FBT",  # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize()
+    "PLR2004",  # Ignore "Magic value used in comparison"
+    # "S311",  # Standard pseudo-random generators are not suitable for cryptographic purposes
+]
+
+
+[tool.ruff.lint.flake8-annotations]
+mypy-init-return = true
+
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
+
+
+[tool.mypy]
+python_version = "3.12"
+exclude = [
+]
+
+
+[[tool.mypy.overrides]]
+module = [""
+]
+ignore_missing_imports = true
+
+[tool.pytest.ini_options]
+
+addopts = [
+    "--import-mode=prepend",
+    "-vv",
+    "--exitfirst",
+    "--capture=no",
+    "--showlocals",
+    # "--forked",
+    # "--cov-config=.coverage/coveragerc",
+    # "--cov=src",
+    # "--cov=app",
+    # "--cov-report=html",
+    "--html=.pytest_cache/report.html",
+]
+
+python_files = "*.py"
+
+norecursedirs = [
+    "dist",
+    "doc",
+    "__pycache__",
+]
+
+[tool.pymarkdown]
+# plugins.line-length.line_length = 88
+# plugins.ul-style.style = "sublist"
+# extensions.front-matter.enabled = true
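Note that [dependency-groups] is the uv/PEP 735 mechanism, so the groups above are presumably installed selectively at build time, e.g. with uv sync --group backend --group dev for the API image and uv sync --group frontend for the Streamlit one; the exact flags depend on the uv version captured in uv.lock.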
supervisord.conf
ADDED
@@ -0,0 +1,28 @@
+[supervisord]
+nodaemon=true
+
+[program:nginx]
+command=/usr/sbin/nginx -g "daemon off;"
+autostart=true
+autorestart=true
+user=user
+
+[program:qdrant]
+command=/usr/local/bin/qdrant
+autostart=true
+autorestart=true
+user=user
+
+[program:backend]
+command=uv run fastapi run app/main.py --host 0.0.0.0 --port 8000
+directory=/home/user/app/backend
+autostart=true
+autorestart=true
+user=user
+
+[program:frontend]
+command=uv run streamlit run app/main.py --server.port 8501 --server.address=0.0.0.0
+directory=/home/user/app/frontend
+autostart=true
+autorestart=true
+user=user
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff