jdenis-insn
committed on
Commit
·
737f55b
Parent(s):
de91b1c
init commit for build
- .gitignore +168 -0
- Dockerfile +43 -0
- README.md +2 -2
- backend/app/__init__.py +3 -0
- backend/app/internal/__init__.py +0 -0
- backend/app/internal/bdd_manager.py +129 -0
- backend/app/internal/embedder.py +26 -0
- backend/app/internal/export_report.py +95 -0
- backend/app/internal/llm_chat.py +346 -0
- backend/app/internal/parser.py +104 -0
- backend/app/internal/template_prompt.py +79 -0
- backend/app/main.py +32 -0
- backend/app/resources/logo_ademe.png +0 -0
- backend/app/routers/__init__.py +0 -0
- backend/app/routers/chatting.py +215 -0
- backend/app/routers/embedding.py +169 -0
- backend/app/settings.py +30 -0
- backend/test/test_main.py +6 -0
- dockerignore +17 -0
- frontend/app/__init__.py +0 -0
- frontend/app/main.py +200 -0
- frontend/app/resources/aide-financiere-ademe.JPG +0 -0
- frontend/app/resources/logo_ademe.png +0 -0
- frontend/app/settings.py +11 -0
- frontend/test/test_main.py +6 -0
- nginx.conf +36 -0
- pyproject.toml +144 -0
- supervisord.conf +28 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,168 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env*
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# BDD
qdrant_storage/
memory

*.pdf
Dockerfile
ADDED
@@ -0,0 +1,43 @@
# Step 1: Base image and system dependencies
FROM python:3.12-slim as base

RUN apt-get update && apt-get install -y \
    nginx \
    supervisor \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Step 2: Create the 'user' account with UID 1000
RUN useradd -m -u 1000 user

# Step 3: Set environment variables and the working directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Step 4: Install 'uv' (Python project manager)
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Step 5: Copy the configuration files with the appropriate permissions
COPY --chown=user:user nginx.conf /etc/nginx/nginx.conf
COPY --chown=user:user supervisord.conf /etc/supervisor/conf.d/supervisord.conf

# Step 6: Copy the application files with the appropriate permissions
COPY --chown=user:user . $HOME/app

# Step 7: Install the Python dependencies
COPY --chown=user:user pyproject.toml uv.lock ./
RUN uv sync --no-dev --frozen --no-cache

# Step 8: Download and install Qdrant
RUN curl -fsSL https://github.com/qdrant/qdrant/releases/latest/download/qdrant-linux-x86_64 -o /usr/local/bin/qdrant \
    && chmod +x /usr/local/bin/qdrant

# Step 9: Expose the required ports
EXPOSE 80 6333

# Step 10: Switch to the 'user' account
USER user

# Step 11: Start the application with supervisord
CMD ["supervisord", "-n", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title:
+title: CV_JBDENIS
 emoji: 👁
 colorFrom: indigo
-colorTo:
+colorTo: red
 sdk: docker
 pinned: false
 license: unlicense
backend/app/__init__.py
ADDED
@@ -0,0 +1,3 @@
def hello() -> str:
    """Return greetings."""
    return "Hello from my-app!"
backend/app/internal/__init__.py
ADDED
File without changes
backend/app/internal/bdd_manager.py
ADDED
@@ -0,0 +1,129 @@
from typing import Any
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import CollectionStatus, Distance, VectorParams
from qdrant_client.http.exceptions import UnexpectedResponse
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.retrievers import EnsembleRetriever
from langchain_core.vectorstores import VectorStore
from app.settings import settings


try:
    client = QdrantClient(url=settings.qdrant_url)

except Exception as e:
    raise Exception(f"Error connecting to Qdrant: {e}")


def create_collection(collection_name: str):
    """
    Create a collection in Qdrant if it does not already exist.

    Args:
        collection_name (str): The name of the collection to be created.

    Returns:
        str: A message indicating the result of the operation.

    Raises:
        Exception: If there is an error during the collection creation process.
    """
    try:
        existing_collections = client.get_collections()
        if any(col.name == collection_name for col in existing_collections.collections):
            return f"Collection '{collection_name}' already exists."

        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=768, distance=Distance.COSINE),
        )
        return f"Collection '{collection_name}' created successfully."

    except Exception as e:
        raise Exception(f"Error creating collection '{collection_name}': {e}")


def get_vector_store(embeddings, collection_name):
    """
    Retrieve or initialize a Qdrant vector store for a given collection.

    Args:
        embeddings: The embedding model or function to be used for vectorization.
        collection_name (str): The name of the Qdrant collection.

    Returns:
        QdrantVectorStore: A Qdrant vector store object tied to the specified collection.

    Raises:
        Exception: If the collection does not exist or there is an issue accessing it.
    """
    try:
        collection_info = client.get_collection(collection_name)

        if collection_info.status != CollectionStatus.GREEN:
            raise Exception(
                f"Collection '{collection_name}' is not active (status: {collection_info.status})."
            )

        return QdrantVectorStore(
            client=client, collection_name=collection_name, embedding=embeddings
        )

    except UnexpectedResponse as e:
        raise Exception(
            f"Collection '{collection_name}' does not exist or could not be accessed: {e}"
        )

    except Exception as e:
        raise Exception(
            f"An error occurred while retrieving the vector store for '{collection_name}': {e}"
        )


def get_retriever(vector_store: VectorStore) -> VectorStoreRetriever:
    """
    Converts a vector store into a retriever instance.

    Args:
        vector_store: An object that represents the vector store. It must have an `as_retriever` method.

    Returns
    -------
    VectorStoreRetriever: An instance of VectorStoreRetriever for querying the vector store.

    Raises
    ------
    AttributeError: If the provided vector store does not have an `as_retriever` method.
    """  # noqa: D401
    if not hasattr(vector_store, "as_retriever"):
        raise AttributeError(
            "The provided vector store does not have an 'as_retriever' method."
        )

    return vector_store.as_retriever(
        search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.7}
    )


def get_ensemble_retriever(
    retriever_doc: VectorStoreRetriever, retriever_user: VectorStoreRetriever
) -> EnsembleRetriever:
    """
    Create an ensemble retriever that combines two retrievers with specified weights.

    Args:
        retriever_doc (VectorStoreRetriever): The first retriever,
            typically for document retrieval.
        retriever_user (VectorStoreRetriever): The second retriever,
            typically for user-specific retrieval.

    Returns:
        EnsembleRetriever: An instance of `EnsembleRetriever` combining the two retrievers
        with the specified weights (0.2 for `retriever_doc` and 0.8 for `retriever_user`).
    """
    return EnsembleRetriever(
        retrievers=[retriever_doc, retriever_user], weights=[0.2, 0.8]
    )
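A minimal sketch of how these helpers compose, assuming a running Qdrant instance and valid settings; the provider string and the collection names "demo_docs" / "demo_user" are illustrative, not taken from the commit:

# Minimal wiring sketch (assumes a reachable Qdrant; collection names are placeholders).
from app.internal.bdd_manager import (
    create_collection,
    get_ensemble_retriever,
    get_retriever,
    get_vector_store,
)
from app.internal.embedder import get_embedder

embedder = get_embedder(provider="hf_api")

# Both collections must exist before a vector store can wrap them.
create_collection("demo_docs")
create_collection("demo_user")

doc_store = get_vector_store(embedder, "demo_docs")
user_store = get_vector_store(embedder, "demo_user")

# Each retriever drops hits below the 0.7 similarity threshold set in get_retriever.
doc_retriever = get_retriever(doc_store)
user_retriever = get_retriever(user_store)

# The ensemble weights user-collection results 0.8 vs. 0.2 for the document corpus.
retriever = get_ensemble_retriever(doc_retriever, user_retriever)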
backend/app/internal/embedder.py
ADDED
@@ -0,0 +1,26 @@
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from app.settings import settings


def get_embedder(provider: str = "hf_api"):
    if provider == "hf_local":
        return HuggingFaceEmbeddings(
            model_name=settings.embedding_model_name,
        )

    if provider == "hf_api":
        return HuggingFaceInferenceAPIEmbeddings(
            model_name=settings.embedding_model_name,
            api_key=settings.hf_token,
        )

    if provider == "openai":
        return OpenAIEmbeddings(
            openai_api_key=settings.scw_api_key,
            openai_api_base=settings.scw_generative_apis_endpoint,
            model=settings.embedding_model_name,
            tiktoken_enabled=False,
        )
    return None
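For reference, a short usage sketch of get_embedder, assuming settings.embedding_model_name points at a 768-dimensional model so vectors match the VectorParams(size=768) used in bdd_manager.py; the query string is made up:

# Illustrative only: pick a backend and embed a query.
from app.internal.embedder import get_embedder

embedder = get_embedder(provider="hf_api")  # or "hf_local" / "openai"
if embedder is None:
    raise ValueError("Unknown embedding provider")

vector = embedder.embed_query("transition écologique")
print(len(vector))  # expected to match the collection size (768)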
backend/app/internal/export_report.py
ADDED
@@ -0,0 +1,95 @@
import logging
import re
from typing import List
from fpdf import FPDF
from datetime import datetime

# Logger initialization
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def create_pdf_report(output_path: str, logo_path: str, report_text: str):
    """
    Creates a PDF report with a logo, the current date, and a given text.

    Args:
        output_path (str): The path where the generated PDF will be saved.
        logo_path (str): The path to the logo image to include in the report.
        report_text (str): The text content to include in the report.

    Returns
    -------
    None: The function saves the PDF to the specified output path.

    Raises
    ------
    FileNotFoundError: If the logo file does not exist.
    ValueError: If the provided paths or text are invalid.
    """
    pdf = FPDF()
    pdf.add_page()

    # Set font for the document
    pdf.set_font("Arial", size=12)

    # Add logo
    try:
        pdf.image(logo_path, x=10, y=8, w=30)
    except FileNotFoundError:
        raise FileNotFoundError(f"Logo file not found at: {logo_path}")  # noqa: B904

    # Add title
    pdf.set_font("Arial", style="B", size=16)
    pdf.cell(200, 10, txt="Rapport de conversation avec Dis-ADEME", ln=True, align="C")

    # Add date
    pdf.set_font("Arial", size=12)
    creation_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    pdf.ln(10)  # Add some space
    pdf.cell(
        200,
        10,
        txt=f"Date de création : {creation_date}",
        ln=True,
        align="R",
    )

    # Add content
    pdf.ln(20)  # Add some space
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, txt=report_text)

    # Save the PDF
    try:
        pdf.output(output_path)
        logger.info(f"PDF report created successfully at: {output_path}")
    except Exception as e:  # noqa: BLE001
        raise ValueError(f"Failed to save PDF: {e}")  # noqa: B904


def extract_pdf_references(conversation: List[dict]) -> List[str]:
    """
    Extract unique PDF references from the chatbot's responses in the conversation.

    Args:
        conversation (List[dict]): List of dictionaries representing the conversation.
            Each dictionary contains 'role' ('user' or 'assistant')
            and 'content' (message string).

    Returns:
        List[str]: A list of unique PDF references mentioned in the chatbot's responses.
    """
    pdf_references = set()

    for message in conversation:
        if (
            message.get("role") == "assistant"
            and "Consultez les documents suivants pour plus d'information:"
            in message.get("content", "")
        ):
            # Extract all PDF file names using regex
            matches = re.findall(r"[\w\s-]+\.pdf", message["content"])
            pdf_references.update(matches)
    return sorted(pdf_references)
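A small usage sketch of the two helpers above; the conversation content, output path, and logo path are illustrative placeholders:

# Illustrative usage of the report helpers; paths and messages are placeholders.
from app.internal.export_report import create_pdf_report, extract_pdf_references

conversation = [
    {"role": "user", "content": "Quelles aides pour la rénovation ?"},
    {
        "role": "assistant",
        "content": "Consultez les documents suivants pour plus d'information: guide-renovation.pdf",
    },
]

# Collects *.pdf names from assistant messages that carry the reference marker.
refs = extract_pdf_references(conversation)
print(refs)

create_pdf_report(
    output_path="export.pdf",
    logo_path="backend/app/resources/logo_ademe.png",
    report_text="Résumé de la conversation...",
)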
backend/app/internal/llm_chat.py
ADDED
@@ -0,0 +1,346 @@
import uuid
from typing import Any, Callable, Dict, List

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.base import Chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

from app.internal.export_report import extract_pdf_references
from app.internal.template_prompt import summary_system_prompt
from app.settings import settings


def get_chat_llm() -> BaseChatModel:
    """
    Initializes and returns a chat model configured with the provided settings.

    Returns:
        BaseChatModel: A chat model configured to use the specified model, API endpoint, and API key.

    Raises:
        ValueError: If any of the required settings (endpoint, API key, or model name) is missing.
    """
    try:
        if settings.provider == "hf_local":
            pass

        if settings.provider == "hf_api":
            if not settings.hf_token:
                raise ValueError("The Hugging Face API token is not set.")

            llm = HuggingFaceEndpoint(
                repo_id=settings.llm_model_name,
                task="text-generation",
                max_new_tokens=settings.max_length,
                do_sample=False,
                repetition_penalty=1.03,
                temperature=settings.temperature,
                # huggingfacehub_api_token=settings.hf_token,
            )

            return ChatHuggingFace(llm=llm)

        if settings.provider == "openai":
            if not settings.scw_generative_apis_endpoint:
                raise ValueError("The SCW Generative APIs endpoint is not set.")
            if not settings.scw_api_key:
                raise ValueError("The SCW API key is not set.")
            if not settings.llm_model_name:
                raise ValueError("The LLM model name is not set.")

            return ChatOpenAI(
                base_url=settings.scw_generative_apis_endpoint,
                api_key=settings.scw_api_key,
                model=settings.llm_model_name,
                temperature=settings.temperature,
            )
    except Exception as e:
        raise RuntimeError(f"Failed to initialize the chat model: {e}")


def get_history_retriever(llm, retriever, contextualize_q_prompt) -> object:
    """
    Creates a history-aware retriever using the provided LLM, retriever, and contextualization prompt.

    Args:
        llm: The language model used for generating context-aware queries.
        retriever: The retriever instance for querying a vector store or similar.
        contextualize_q_prompt: A prompt template for contextualizing queries.

    Returns:
        object: A history-aware retriever instance.

    Raises:
        ValueError: If any of the required inputs are None or invalid.
    """
    if not llm or not retriever or not contextualize_q_prompt:
        raise ValueError(
            "LLM, retriever, and contextualize_q_prompt must all be provided."
        )

    try:
        return create_history_aware_retriever(llm, retriever, contextualize_q_prompt)
    except Exception as e:
        raise RuntimeError(f"Failed to create history-aware retriever: {e}")


def get_system_prompt_chain(llm, qa_prompt) -> object:
    """
    Creates a prompt chain for processing system-level instructions with a question-answering prompt.

    Args:
        llm: The language model used for processing the system prompt.
        qa_prompt: The prompt template for question-answering tasks.

    Returns:
        object: A chain instance for system prompt processing.

    Raises:
        ValueError: If either `llm` or `qa_prompt` is None.
    """
    if not llm or not qa_prompt:
        raise ValueError("LLM and qa_prompt must both be provided.")

    try:
        return create_stuff_documents_chain(llm, qa_prompt)
    except Exception as e:
        raise RuntimeError(f"Failed to create system prompt chain: {e}")


def get_rag_chain(history_aware_retriever, question_answer_chain) -> object:
    """
    Creates a Retrieval-Augmented Generation (RAG) chain using a history-aware retriever and a Q&A chain.

    Args:
        history_aware_retriever: A retriever configured to incorporate conversation history into queries.
        question_answer_chain: A chain for handling question-answering tasks.

    Returns:
        object: A RAG chain instance.

    Raises:
        ValueError: If either `history_aware_retriever` or `question_answer_chain` is None.
    """
    if not history_aware_retriever or not question_answer_chain:
        raise ValueError(
            "Both history_aware_retriever and question_answer_chain must be provided."
        )

    try:
        return create_retrieval_chain(history_aware_retriever, question_answer_chain)
    except Exception as e:
        raise RuntimeError(f"Failed to create RAG chain: {e}")


def get_session_history(session_id: str, history_store: dict) -> BaseChatMessageHistory:
    """
    Retrieves or initializes the chat history for a given session ID.

    Args:
        session_id (str): The unique identifier for the session.
        history_store (dict): A dictionary to store session histories.

    Returns:
        BaseChatMessageHistory: The chat message history for the session.

    Raises:
        ValueError: If `session_id` is not provided.
    """
    if not session_id:
        raise ValueError("A valid session_id must be provided.")

    if session_id not in history_store:
        history_store[session_id] = ChatMessageHistory()

    return history_store[session_id]


def get_conversational_rag_chain(
    rag_chain: Chain,
    session_history_func: Callable[[str], BaseChatMessageHistory],
) -> RunnableWithMessageHistory:
    """
    Creates a conversational Retrieval-Augmented Generation (RAG) chain with session history.

    Args:
        rag_chain (Chain): The RAG chain for handling retrieval and generation tasks.
        session_history_func (Callable): A function to retrieve or initialize session history.

    Returns:
        RunnableWithMessageHistory: A chain that maintains message history and processes input/output.

    Raises:
        ValueError: If `rag_chain` or `session_history_func` is not provided.
    """
    if not rag_chain:
        raise ValueError("A valid rag_chain must be provided.")
    if not session_history_func:
        raise ValueError("A valid session history function must be provided.")

    return RunnableWithMessageHistory(
        rag_chain,
        session_history_func,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )


def question_to_conversational_rag_chain(
    user_query: str, conversational_rag_chain: Any, session_id: str = None
) -> Dict[str, Any]:
    """
    Sends a user query to a conversational RAG chain and retrieves the response.

    Args:
        user_query (str): The query from the user.
        conversational_rag_chain (Any): The conversational RAG chain instance.
        session_id (str, optional): A unique identifier for the session. If not provided, a new session_id is generated.

    Returns:
        Dict[str, Any]: The response from the conversational RAG chain.

    Raises:
        ValueError: If the user query is empty or the RAG chain is not provided.
        RuntimeError: If an error occurs during the invocation of the RAG chain.
    """
    if not user_query:
        raise ValueError("The user query must be a non-empty string.")
    if not conversational_rag_chain:
        raise ValueError("A valid conversational RAG chain must be provided.")

    # Generate a session_id if none is provided
    if not session_id:
        session_id = str(uuid.uuid4())

    try:
        # Invoke the conversational RAG chain
        return conversational_rag_chain.invoke(
            {"input": user_query}, config={"configurable": {"session_id": session_id}}
        )
    except Exception as e:
        raise RuntimeError(f"Failed to process the query with the RAG chain: {e}")


def get_documents_retrieve(output: Dict[str, Any], max_docs: int = 3) -> List[str]:
    """
    Retrieves the titles of the documents from the output context.

    Args:
        output (Dict[str, Any]): The output containing context and metadata.
        max_docs (int): The maximum number of document titles to retrieve. Default is 3.

    Returns:
        List[str]: A list of document titles, or None when the 'context' key is missing.
    """
    if "context" not in output:
        return None

    return [
        output["context"][i].metadata.get("Title", "Untitled Document")
        for i in range(min(len(output["context"]), max_docs))
    ]  # TODO: add a filter on the type of documents to return


def get_llm_answer(output: Dict[str, Any]) -> str:
    """
    Extracts the answer generated by the LLM from the output.

    Args:
        output (Dict[str, Any]): The output containing the answer.

    Returns:
        str: The LLM-generated answer.

    Raises:
        ValueError: If the 'answer' key is missing or empty in the output.
    """
    if "answer" not in output or not output["answer"]:
        raise ValueError("The output does not contain a valid 'answer'.")

    return output["answer"]


def get_format_output(answer: str, context: List[str]) -> str:
    """
    Formats the LLM answer with a list of related document titles.

    Args:
        answer (str): The LLM-generated answer.
        context (List[str]): A list of document titles related to the answer.

    Returns:
        str: A formatted string containing the answer and document references.

    Raises:
        ValueError: If the answer is empty or None.
    """
    if not answer:
        raise ValueError("The 'answer' must be a non-empty string.")

    formatted_output = f"{answer}"
    if context:
        uniques_doc = set(context)
        formatted_output += (
            "\n\nConsultez les documents suivants pour plus d'information:\n\n"
        )
        formatted_output += "\n\n".join(uniques_doc)

    return formatted_output


def clean_output(answer):  # TODO add clean process for output
    pass


def generate_summary(llm, conversation: List[dict]) -> str:
    """
    Generate a summary of the conversation with LangChain and append PDF references at the end.

    Args:
        conversation (List[dict]): List of dictionaries representing the conversation.
            Each dictionary contains 'role' ('user' or 'assistant')
            and 'content' (message string).
        llm: The chat model used to produce the summary.

    Returns:
        str: The generated summary with PDF references appended.
    """
    # Extract unique PDF references
    pdf_references = extract_pdf_references(conversation)

    # Prepare the messages; copy the shared system prompt so repeated calls do not mutate it
    messages = list(summary_system_prompt)

    for message in conversation:
        if message["role"] == "user":
            messages.append(HumanMessage(content=message["content"]))

        elif message["role"] == "assistant":
            messages.append(AIMessage(content=message["content"]))

    # Generate the summary
    summary_prompt = ChatPromptTemplate.from_messages(messages).format()

    summary = llm.invoke(summary_prompt)

    # Append the PDF references
    summary_text = summary.content
    if pdf_references:
        summary_text += (
            "\n\nDocuments pdf à consulter pour plus d'information:"
            + "\n".join(sorted(pdf_references))
        )

    return summary_text
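Taken together, these helpers assemble into a single conversational chain. A condensed wiring sketch, assuming the Qdrant collection "demo_docs" already exists and the configured provider credentials are valid (the collection name and query are illustrative):

# Illustrative end-to-end wiring of the helpers above; names are placeholders.
from app.internal.bdd_manager import get_retriever, get_vector_store
from app.internal.embedder import get_embedder
from app.internal.llm_chat import (
    get_chat_llm,
    get_conversational_rag_chain,
    get_history_retriever,
    get_llm_answer,
    get_rag_chain,
    get_session_history,
    get_system_prompt_chain,
    question_to_conversational_rag_chain,
)
from app.internal.template_prompt import contextualize_q_prompt, qa_prompt

history_store: dict = {}

embedder = get_embedder(provider="openai")
retriever = get_retriever(get_vector_store(embedder, "demo_docs"))

llm = get_chat_llm()
rag_chain = get_rag_chain(
    get_history_retriever(llm, retriever, contextualize_q_prompt),
    get_system_prompt_chain(llm, qa_prompt),
)
chain = get_conversational_rag_chain(
    rag_chain, lambda sid: get_session_history(sid, history_store)
)

# Each call threads its session history through the chain automatically.
output = question_to_conversational_rag_chain("Qu'est-ce que l'ADEME ?", chain)
print(get_llm_answer(output))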
backend/app/internal/parser.py
ADDED
@@ -0,0 +1,104 @@
import os
from typing import Iterator, List, Union

import openparse
from docling.document_converter import DocumentConverter
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter

from app.settings import Settings


def get_pdf_paths(directory_or_file: Union[str, os.PathLike]) -> List[str]:
    """
    Retrieve all PDF file paths from a given directory, including its subdirectories, or from a single file.

    Args:
        directory_or_file (Union[str, os.PathLike]): Path to a directory or a single file.

    Returns:
        List[str]: A list of file paths to PDF files.

    Raises:
        FileNotFoundError: If the given path does not exist.
        ValueError: If the input path is neither a directory nor a PDF file.
    """
    if not os.path.exists(directory_or_file):
        raise FileNotFoundError(f"The path '{directory_or_file}' does not exist.")

    pdf_paths = []

    if os.path.isdir(directory_or_file):
        for root, _, files in os.walk(directory_or_file):
            for file in files:
                if file.lower().endswith(".pdf"):
                    pdf_paths.append(os.path.join(root, file))

    elif os.path.isfile(directory_or_file):
        if directory_or_file.lower().endswith(".pdf"):
            pdf_paths.append(directory_or_file)
        else:
            raise ValueError(f"The file '{directory_or_file}' is not a PDF.")
    else:
        raise ValueError(
            f"The path '{directory_or_file}' is neither a directory nor a valid file."
        )

    return pdf_paths


settings = Settings()


def parse_document(doc_path, parser=settings.parser):
    parsed_doc = None

    if parser == "openparse":
        # Use a distinct name so the `parser` argument is not shadowed.
        doc_parser = openparse.DocumentParser()
        parsed_basic_doc = doc_parser.parse(doc_path)

        parsed_doc = [
            node.text.replace("<br><br>", "\n") for node in parsed_basic_doc.nodes
        ]

    if parser == "docling":  # FIXME
        converter = DocumentConverter()
        parsed_doc = converter.convert(doc_path)

        # loader = DoclingPDFLoader(file_path=doc_path)
        # parsed_doc = loader.load()

    return parsed_doc


def split_documents(text_splitter, docs):
    return text_splitter.split_documents(docs)


def get_text_chunker():
    return RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)


# class DoclingPDFLoader(BaseLoader):

#     def __init__(self, file_path: str | list[str]) -> None:
#         self._file_paths = file_path if isinstance(
#             file_path, list) else [file_path]
#         self._converter = DocumentConverter()

#     def lazy_load(self) -> Iterator[LCDocument]:
#         for source in self._file_paths:
#             dl_doc = self._converter.convert(source).document
#             text = dl_doc.export_to_markdown()
#             yield LCDocument(page_content=text)


# loader = DoclingPDFLoader(file_path=path)
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1000,
#     chunk_overlap=200,
# )

# docs = loader.load()
# splits = text_splitter.split_documents(docs)

# splits
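A brief sketch of the ingestion path these helpers support, assuming the "openparse" parser (which returns a list of text blocks); the directory "data/pdfs" is an illustrative placeholder:

# Illustrative: parse PDFs into text blocks, then chunk them for embedding.
from app.internal.parser import get_pdf_paths, get_text_chunker, parse_document

paths = get_pdf_paths("data/pdfs")  # walks the directory tree for *.pdf
chunker = get_text_chunker()        # 1000-char chunks with 200-char overlap

for path in paths:
    texts = parse_document(path, parser="openparse")
    documents = chunker.create_documents(texts)
    print(path, len(documents))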
backend/app/internal/template_prompt.py
ADDED
@@ -0,0 +1,79 @@
from langchain_core.messages import SystemMessage  # noqa: D100
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

### Contextualize question ###
contextualize_q_system_prompt = """Based on the provided chat history and the most
recent user question, your task is to reformulate the latest question
into a fully standalone version.

Ensure the reformulated question is clear, self-contained, and does not rely
on any prior context from the chat history to be understood.
If the latest question already functions as a standalone question,
return it unchanged.
Do NOT provide an answer to the question or interpret the user’s intent
beyond making the question self-contained.
Retain all technical details, key terms, and precision from the original
question in your reformulation.
Your sole output should be the reformulated standalone question,
or the original question if no reformulation is required."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


### Answer question ###
system_prompt = """You are an intelligent and professional assistant named 'Dis-ADEME',
created by the ADEME organization to assist with question-answering tasks related
to ecological transition, sustainable practices, and technical inquiries.

Use the provided retrieved context to answer the user's question accurately
and concisely.
If the retrieved context does not contain the necessary information,
explicitly state:
"Je suis désolé, je ne dispose pas des informations nécessaires
pour répondre à cette question."
Limit your response to a maximum of three sentences while maintaining clarity
and relevance. Ensure that your tone is formal and professional,
as your responses are intended for official use.
Do not speculate or provide information that is not explicitly supported
by the retrieved context.
Context:
{context}"""

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

### Conversation summary ###
summary_report_system_prompt = """
You are a knowledgeable and professional French assistant named 'Dis-ADEME',
created by the ADEME organization.
Your task is to summarize in French the following conversation between a user and
an assistant, providing a structured, comprehensive, and detailed summary.

Focus exclusively on the content and technical details discussed in the conversation,
omitting any reference to the roles of the participants
(e.g., "user" or "assistant").
Present the information in clear, concise, and professional language,
suitable for inclusion in an official administrative report.
Emphasize critical technical details, key points of discussion,
and any actionable insights or conclusions derived from the conversation.
Organize the summary into sections or paragraphs if appropriate,
ensuring clarity and logical flow.
If the conversation references external documents or resources (e.g., PDFs),
include their titles or descriptions in a dedicated section at the end of the summary.
Do not include any conversational or informal elements; maintain
a formal and neutral tone throughout.
Output your response as a structured report in French, ready for official use.
"""

summary_system_prompt = [SystemMessage(content=summary_report_system_prompt)]
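For illustration, how the contextualization prompt renders against a tiny chat history (the message contents are made up):

# Illustrative rendering of contextualize_q_prompt with a two-turn history.
from langchain_core.messages import AIMessage, HumanMessage
from app.internal.template_prompt import contextualize_q_prompt

messages = contextualize_q_prompt.format_messages(
    chat_history=[
        HumanMessage(content="Qu'est-ce que l'ADEME ?"),
        AIMessage(content="L'ADEME est l'agence de la transition écologique."),
    ],
    input="Quelles aides propose-t-elle ?",
)
for m in messages:
    print(type(m).__name__, ":", m.content[:60])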
backend/app/main.py
ADDED
@@ -0,0 +1,32 @@
"""Main module."""

import logging
from typing import Any
import uvicorn
from fastapi import FastAPI

from app.routers.chatting import chat_router
from app.routers.embedding import embedding_router


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

logger = logging.getLogger(__name__)

app = FastAPI()

app.include_router(embedding_router)
app.include_router(chat_router)


@app.get("/")
async def root() -> Any:  # noqa: ANN401
    """Return greetings."""
    return {"message": "Hello ADEME!"}


if __name__ == "__main__":
    uvicorn.run(app, log_level="info")
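A quick smoke test of the root route with FastAPI's test client; note that importing app.main pulls in the routers, which build their Qdrant and embedding resources at import time, so the backing services are assumed to be reachable:

# Minimal smoke test for the root route.
from fastapi.testclient import TestClient
from app.main import app

client = TestClient(app)
response = client.get("/")
assert response.status_code == 200
assert response.json() == {"message": "Hello ADEME!"}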
backend/app/resources/logo_ademe.png
ADDED
backend/app/routers/__init__.py
ADDED
File without changes
backend/app/routers/chatting.py
ADDED
@@ -0,0 +1,215 @@
import logging
from typing import Any, Dict, List

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from app.internal.bdd_manager import (
    create_collection,
    get_ensemble_retriever,
    get_retriever,
    get_vector_store,
)
from app.internal.embedder import get_embedder
from app.internal.export_report import create_pdf_report
from app.internal.llm_chat import (
    generate_summary,
    get_chat_llm,
    get_conversational_rag_chain,
    get_documents_retrieve,
    get_format_output,
    get_history_retriever,
    get_llm_answer,
    get_rag_chain,
    get_session_history,
    get_system_prompt_chain,
    question_to_conversational_rag_chain,
)
from app.internal.template_prompt import contextualize_q_prompt, qa_prompt
from app.settings import settings

# Logger initialization
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

chat_router = APIRouter(
    prefix="/chatting",
    tags=["question_answer"],
    responses={404: {"description": "Not found"}},
)


class QueryRequest(BaseModel):
    user_query: str
    session_id: str = settings.session_id


class ResponseOutput(BaseModel):
    answer: str
    context: List[str]
    formatted_output: str


class Conversation(BaseModel):
    messages: List[Any]


class ResponseOutputSum(BaseModel):
    summary: str


# Resource initialization
user_collection_name = settings.user_collection_name
logger.info("Initializing collection: %s", user_collection_name)
create_collection(user_collection_name)

doc_collection_name = settings.doc_collection_name
logger.info("Initializing collection: %s", doc_collection_name)
create_collection(doc_collection_name)

embedder = get_embedder(provider=settings.provider)
logger.info("Embedder initialized.")

doc_vector_store = get_vector_store(embedder, doc_collection_name)
logger.info("Vector store initialized with collection: %s", doc_collection_name)

user_vector_store = get_vector_store(embedder, user_collection_name)
logger.info("Vector store initialized with collection: %s", user_collection_name)

logger.info("Initializing LLM and retrievers...")
llm = get_chat_llm()
user_retriever = get_retriever(user_vector_store)
doc_retriever = get_retriever(doc_vector_store)
retriever = get_ensemble_retriever(doc_retriever, user_retriever)

logger.info("Creating history-aware retriever...")
history_retriever = get_history_retriever(llm, retriever, contextualize_q_prompt)

logger.info("Creating system prompt chain...")
qa_chain = get_system_prompt_chain(llm, qa_prompt)

logger.info("Creating RAG chain...")
rag_chain = get_rag_chain(history_retriever, qa_chain)

logger.info("Initializing conversational RAG chain...")
conversational_chain = get_conversational_rag_chain(
    rag_chain,
    # Use the per-request session id rather than the static one from settings.
    lambda sid: get_session_history(sid, settings.history_store),
)


@chat_router.post("/chat", response_model=ResponseOutput)
async def chat_with_rag_chain(request: QueryRequest):
    """
    Route for interacting with the RAG (Retrieval-Augmented Generation) chain.
    """
    logger.info("Received chat request with session_id: %s", request.session_id)
    logger.info("User query: %s", request.user_query)

    try:
        logger.info("Processing user query...")
        response = question_to_conversational_rag_chain(
            request.user_query, conversational_chain, request.session_id
        )
        logger.info("LLM response received: %s", response)

        answer = get_llm_answer(response)
        documents = get_documents_retrieve(response)

        logger.info("Formatting output...")
        formatted_output = get_format_output(answer, documents)

        logger.info(
            "Successfully processed chat request for session_id: %s", request.session_id
        )
        return {
            "answer": answer,
            "context": documents,
            "formatted_output": formatted_output,
        }

    except ValueError as e:
        logger.error("Validation error: %s", str(e))
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error("Internal server error: %s", str(e))
        raise HTTPException(status_code=500, detail=f"Internal server error: {e}")


@chat_router.get("/history/{session_id}")
async def get_chat_history(session_id: str):
    """
    Route for retrieving the message history of a given session.
    """
    logger.info("Fetching chat history for session_id: %s", session_id)

    try:
        history = get_session_history(session_id, settings.history_store)
        logger.info(
            "Successfully retrieved chat history for session_id: %s", session_id
        )
        return {"session_id": session_id, "history": history.messages}
    except ValueError as e:
        logger.error("Validation error: %s", str(e))
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error("Internal server error while fetching history: %s", str(e))
        raise HTTPException(status_code=500, detail=f"Internal server error: {e}")


@chat_router.post("/summary", response_model=ResponseOutputSum)
async def summarize_conversation(conversation: Conversation):
    """
    Generates a summary of the conversation and lists the referenced PDF documents.

    Args:
        conversation (Conversation): Object holding the conversation messages.

    Returns:
        dict: The conversation summary and the list of referenced PDF documents.
    """
    output_path = r"..\Shared_data\export.pdf"
    # output_path = r"C:\Users\jeanb\Documents\kzs-team\Shared_data\export.pdf"
    logo_path = r"app\resources\logo_ademe.png"
    summary_text = generate_summary(llm, conversation.messages)

    create_pdf_report(output_path, logo_path, summary_text)

    return {"summary": summary_text}
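For reference, calling the chat route over HTTP; the host and port are illustrative (the backend is assumed to be reachable at localhost:8000) and the session id is a placeholder:

# Illustrative client call to the /chatting/chat endpoint.
import httpx

payload = {
    "user_query": "Quelles sont les aides de l'ADEME ?",
    "session_id": "demo-session",
}
response = httpx.post(
    "http://localhost:8000/chatting/chat", json=payload, timeout=60.0
)
response.raise_for_status()

body = response.json()  # matches the ResponseOutput model
print(body["answer"])
print(body["formatted_output"])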
backend/app/routers/embedding.py
ADDED
@@ -0,0 +1,169 @@
"""Embedding tools."""

import logging
import os
from typing import List
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from app.internal.bdd_manager import create_collection, get_vector_store
from app.internal.embedder import get_embedder
from app.internal.parser import get_pdf_paths, get_text_chunker, parse_document
from app.settings import settings


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

embedding_router = APIRouter(
    prefix="/embeddings",
    tags=["documents"],
    responses={404: {"description": "Not found"}},
)

user_collection_name = settings.user_collection_name
logger.info("Initializing collection: %s", user_collection_name)
create_collection(user_collection_name)

doc_collection_name = settings.doc_collection_name
logger.info("Initializing collection: %s", doc_collection_name)
create_collection(doc_collection_name)

embedder = get_embedder(provider=settings.provider)
logger.info("Embedder initialized.")

doc_vector_store = get_vector_store(embedder, doc_collection_name)
logger.info("Vector store initialized with collection: %s", doc_collection_name)

user_vector_store = get_vector_store(embedder, user_collection_name)
logger.info("Vector store initialized with collection: %s", user_collection_name)

text_splitter = get_text_chunker()
logger.info("Text splitter initialized.")


def get_vectorstore(vectorstor_type):
    if vectorstor_type == "user":
        return user_vector_store

    if vectorstor_type == "doc":
        return doc_vector_store
    return None


class DocPathsInput(BaseModel):  # TODO move to schema.py
    doc_paths: str
    vectorstor_type: str


@embedding_router.post("/embedded/")
async def embedding(doc_paths_input: DocPathsInput):
    """
    Embeds documents provided via file paths and adds them to the vector store.

    Args:
        doc_paths_input (DocPathsInput): A Pydantic model containing
            a list of document file paths.

    Returns:
        dict: A response containing the number of documents added to the vector store.

    Raises:
        HTTPException: If the document parsing or embedding process fails.
    """
    logger.info("Received request to embed documents: %s", doc_paths_input.doc_paths)
    vector_store = get_vectorstore(doc_paths_input.vectorstor_type)

    try:
        folder_path = doc_paths_input.doc_paths
        logger.info(folder_path)
        doc_paths = get_pdf_paths(folder_path)
        logger.info(doc_paths)
        for path in doc_paths:
            try:
                logger.info("Parsing document at path: %s", path)
                parsed_documents = parse_document(path)
                # os.path.basename works on any OS, unlike splitting on "\\".
                doc_title = os.path.basename(path)
                logger.info("Document parsed: %s", doc_title)

                documents = text_splitter.create_documents(
                    parsed_documents,
                    metadatas=[
                        {"Title": doc_title} for _ in range(len(parsed_documents))
                    ],
                )
                logger.info(
                    "Created %d document chunks for: %s", len(documents), doc_title
                )

                vector_store.add_documents(documents)

                logger.info("Documents added to vector store: %s", doc_title)

            except Exception as e:
                logger.info(
                    f"An error occurred during the parsing of the file {path}: {e}"
                )

        logger.info("All documents successfully processed and embedded.")
        return {
            "message": "Documents successfully embedded and stored",
            "documents_added": len(doc_paths),
        }

    except Exception as e:
        logger.error("An error occurred during the embedding process: %s", e)
        raise HTTPException(status_code=500, detail=f"An error occurred: {e!s}")


class SearchQuery(BaseModel):  # TODO move to schema.py
    vectorstor_type: str
    query: str
    k: int = 2


@embedding_router.post("/similarity_search/")
async def search_documents(search_query: SearchQuery):
    """
    Search for documents in the vector store based on a query.

    Args:
        search_query (SearchQuery): A Pydantic model containing the query string and the number of results (k).

    Returns:
        List[dict]: A list of documents matching the query, including their content and metadata.

    Raises:
        HTTPException: If the search process fails or no documents are found.
    """
    logger.info("Received similarity search query: %s", search_query.query)

    vector_store = get_vectorstore(search_query.vectorstor_type)

    try:
        found_docs = vector_store.similarity_search(
            search_query.query, k=search_query.k
        )
        logger.info(
            "Found %d documents for query: %s", len(found_docs), search_query.query
        )

        if not found_docs:
            logger.warning("No documents found for query: %s", search_query.query)
            raise HTTPException(
                status_code=404, detail="No documents found for the given query."
            )

        logger.info("Returning results for query: %s", search_query.query)
        return [
            {
                "content": doc.page_content,
|
161 |
+
"metadata": doc.metadata if hasattr(doc, "metadata") else None,
|
162 |
+
}
|
163 |
+
for doc in found_docs
|
164 |
+
]
|
165 |
+
except Exception as e:
|
166 |
+
logger.error("An error occurred during the similarity search: %s", e)
|
167 |
+
raise HTTPException(
|
168 |
+
status_code=500, detail=f"An error occurred during the search: {e}"
|
169 |
+
)
|
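A minimal sketch of driving both routes, assuming the backend is reachable directly on port 8000 (the payload fields match the Pydantic models above; the folder path is hypothetical):

import requests

BASE = "http://localhost:8000"  # assumed direct backend address

# Index every PDF found under a folder into the "doc" vector store.
resp = requests.post(
    f"{BASE}/embeddings/embedded/",
    json={"doc_paths": "/data/pdfs", "vectorstor_type": "doc"},  # hypothetical folder
    timeout=300,
)
resp.raise_for_status()
print(resp.json())  # {"message": ..., "documents_added": N}

# Retrieve the 2 most similar chunks for a query.
resp = requests.post(
    f"{BASE}/embeddings/similarity_search/",
    json={"vectorstor_type": "doc", "query": "aides financières", "k": 2},
)
resp.raise_for_status()
for hit in resp.json():
    print(hit["metadata"], hit["content"][:80])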
backend/app/settings.py
ADDED
@@ -0,0 +1,30 @@
+import os
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
+    llm_model_name: str = "HuggingFaceH4/zephyr-7b-beta"
+    context_window_size: int = 5
+    retrieval_top_k: int = 3
+    temperature: float = 0.2
+    max_length: int = 2048
+    hf_token: str = os.getenv("HF_TOKEN", "")
+
+    if not hf_token:
+        raise ValueError(
+            "ERREUR : Le token Hugging Face (HF_TOKEN) n'est pas défini ! Ajoute-le dans les variables d'environnement Hugging Face Spaces."
+        )
+
+    embedding_model_name: str = "sentence-transformers/sentence-t5-xxl"
+    # qdrant_url: str = "http://qdrant:6333"
+    qdrant_url: str = "http://localhost:6333"
+    parser: str = "openparse"
+    history_store: dict = {}
+    session_id: str = "user012025"
+    user_collection_name: str = "User_Ademe_collection"
+    doc_collection_name: str = "Doc_Ademe_collection"
+    provider: str = "hf_api"
+
+
+settings = Settings()
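Since Settings derives from BaseSettings, each field can be overridden by an environment variable (or by the .env file named in model_config) before falling back to the defaults above. A small sketch, assuming the variables are set before the module is imported (the token value is a placeholder):

import os

# HF_TOKEN must be present before import, otherwise the class-body check
# in app.settings raises ValueError at import time.
os.environ["HF_TOKEN"] = "hf_xxx"                 # placeholder token
os.environ["QDRANT_URL"] = "http://qdrant:6333"   # switch to the Docker hostname

from app.settings import settings

print(settings.qdrant_url)      # -> http://qdrant:6333
print(settings.llm_model_name)  # -> HuggingFaceH4/zephyr-7b-beta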
backend/test/test_main.py
ADDED
@@ -0,0 +1,6 @@
+"""Basic test."""
+
+
+def test_basic() -> None:
+    """Test that 1 + 1 equals 2."""
+    assert 1 + 1 == 2
dockerignore
ADDED
@@ -0,0 +1,17 @@
+dist/
+env/
+presentation/
+__pycache__/
+.coverage/
+.doit.db
+.git/
+.gitignore
+.idea/
+.mypy_cache/
+.pytest_cache/
+.ruff_cache/
+.venv/
+.gitlab-ci.yml
+renovate.json
+Dockerfile
+dodo.py
frontend/app/__init__.py
ADDED
File without changes
frontend/app/main.py
ADDED
@@ -0,0 +1,200 @@
+import os
+import base64
+from pathlib import Path
+import streamlit as st
+import requests
+from settings import settings
+
+
+BASE_DIR = str(Path(__file__).resolve().parent)
+# API_URL_CHAT = "http://localhost:8088/chatting/chat"
+# API_URL_EMBEDDING = "http://localhost:8088/embeddings/embedded"
+# API_URL_SUM = "http://localhost:8088/chatting/summary"
+API_URL_CHAT = "http://localhost/api/chatting/chat"
+API_URL_EMBEDDING = "http://localhost/api/embeddings/embedded"
+API_URL_SUM = "http://localhost/api/chatting/summary"
+
+st.set_page_config(
+    page_title="CV_JBDENIS",
+    page_icon="🧊",
+)
+
+# Helper functions for the page background
+
+
+def get_base64_of_bin_file(bin_file):  # noqa: ANN001, ANN201, D103
+    with open(bin_file, "rb") as f:
+        data = f.read()
+    return base64.b64encode(data).decode()
+
+
+def set_png_as_page_bg(png_file) -> None:  # noqa: ANN001, D103
+    bin_str = get_base64_of_bin_file(png_file)
+    page_bg_img = (
+        """
+    <style>
+    .stApp {
+    background-image: url("data:image/png;base64,%s");
+    background-size: cover;
+    }
+    </style>
+    """  # noqa: UP031
+        % bin_str
+    )
+    st.markdown(page_bg_img, unsafe_allow_html=True)
+    return  # noqa: PLR1711
+
+
+# Set the background
+set_png_as_page_bg(png_file=r"app\resources\aide-financiere-ademe.JPG")
+
+logo_path = r"app\resources\logo_ademe.png"
+
+col1, col2 = st.columns([3, 2])
+with col1:
+    st.image(logo_path, width=400)
+with col2:
+    st.title("Dis-ADEME")
+    st.write("Bienvenue dans votre application de chat.")
+
+# Navigation
+st.sidebar.title("Menu")
+page = st.sidebar.radio("Navigation", ["Accueil", "Admin"])
+
+
+def save_uploaded_files(uploaded_files: list):  # noqa: ANN201, D103
+    save_dir = BASE_DIR + r"\uploaded_files\user"
+    # save_dir = r"\Shared_data\uploaded_files"
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+
+    saved_file_paths = []
+    for uploaded_file in uploaded_files:
+        file_path = os.path.join(save_dir, uploaded_file.name)
+        with open(file_path, "wb") as f:
+            f.write(uploaded_file.getbuffer())
+        saved_file_paths.append(file_path)
+        st.session_state.uploaded_files.append(file_path)
+
+    return saved_file_paths
+
+
+# Home page
+if page == "Accueil":
+    if "uploaded_files" not in st.session_state:
+        st.session_state.uploaded_files = []
+
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+
+    saved_paths = []
+    with st.sidebar:
+        st.header("Uploader des fichiers PDF")
+        uploaded_files = st.file_uploader(
+            "Choisissez des fichiers PDF",
+            type="pdf",
+            accept_multiple_files=True,
+            key="pdf_uploader",
+        )
+
+        if uploaded_files:
+            saved_paths = save_uploaded_files(uploaded_files)
+            st.success(f"Fichiers sauvegardés : {saved_paths[-1]}, en analyse ...")
+
+        if saved_paths:
+            try:
+                response = requests.post(
+                    API_URL_EMBEDDING,
+                    json={"doc_paths": saved_paths[-1], "vectorstor_type": "user"},
+                )
+                response.raise_for_status()
+                embedded = response.json().get(
+                    "message",
+                    "Désolé, une erreur s'est produite durant la lecture du fichier.",
+                )
+
+                if response:
+                    st.success(f"Analyse du fichier {saved_paths[-1]} terminée.")
+
+                saved_paths = []
+            except requests.RequestException as e:
+                embedded = f"Erreur lors de la communication avec l'API : {e}"
+
+        if st.session_state.messages:
+            st.write("")
+            st.divider()
+            st.write("")
+            st.header("Rapport de conversation")
+            if st.button("Générer le rapport de conversation"):
+                try:
+                    response = requests.post(
+                        API_URL_SUM, json={"messages": st.session_state.messages}
+                    )
+                    response.raise_for_status()
+                    summary = response.json().get("summary", "Résumé non disponible.")
+                    st.subheader("Résumé généré")
+                    st.text_area("Rapport", summary, height=200)
+                except requests.exceptions.RequestException as e:
+                    st.error(f"Erreur lors de l'appel de l'API : {e}")
+                if response:
+                    with open(r"..\Shared_data\export.pdf", "rb") as pdf_file:
+                        # with open(r"C:\Users\jeanb\Documents\kzs-team\Shared_data\export.pdf", "rb") as pdf_file:
+
+                        PDFbyte = pdf_file.read()
+
+                    if PDFbyte:
+                        st.download_button(
+                            label="Télécharger le rapport de conversation",
+                            data=PDFbyte,
+                            file_name="Conversation_Dis_ADEME.pdf",
+                            mime="application/octet-stream",
+                        )
+
+    # Chatbot
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"], avatar=message["avatar"]):
+            st.write(message["content"])
+
+    if prompt := st.chat_input("Comment puis-je vous aider ?"):
+        st.session_state.messages.append(
+            {"role": "user", "content": prompt, "avatar": "👤"}
+        )
+        with st.chat_message("user", avatar="👤"):
+            st.write(prompt)
+
+        try:
+            response = requests.post(API_URL_CHAT, json={"user_query": prompt})
+            response.raise_for_status()
+            data = response.json()
+            answer = data.get(
+                "formatted_output", "Désolé, je n'ai pas de réponse à cette question."
+            )
+        except requests.RequestException as e:
+            answer = f"Erreur lors de la communication avec l'API : {e}"
+
+        st.session_state.messages.append(
+            {"role": "assistant", "content": answer, "avatar": "🤖"}
+        )
+        with st.chat_message("assistant", avatar="🤖"):
+            st.write(answer)
+
+# Admin page
+elif page == "Admin":
+    st.title("Admin - Ajouter des documents à la base de données")
+
+    doc_path = st.text_input("Entrez le chemin du document ou du dossier à ajouter")
+
+    if st.button("Ajouter les documents PDF à la base de données"):
+        if doc_path:
+            print("SAVED DOC:", doc_path)
+            try:
+                response = requests.post(
+                    API_URL_EMBEDDING,
+                    json={"doc_paths": doc_path, "vectorstor_type": "doc"},
+                )
+                response.raise_for_status()
+                st.success("Documents ajoutés à la base de données avec succès.")
+            except requests.RequestException as e:
+                st.error(f"Erreur lors de l'ajout des documents : {e}")
+        else:
+            st.warning("Veuillez entrer un chemin valide.")
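One fragility worth flagging: the resource and export paths above are Windows-style raw strings (backslashes), while nginx.conf and supervisord.conf below target a Linux container. A portable variant using pathlib, shown here only as a sketch and not as part of the commit, might look like:

from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent

# Separator-agnostic equivalents of the raw-string paths used above
background = BASE_DIR / "resources" / "aide-financiere-ademe.JPG"
logo_path = BASE_DIR / "resources" / "logo_ademe.png"
save_dir = BASE_DIR / "uploaded_files" / "user"
save_dir.mkdir(parents=True, exist_ok=True)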
frontend/app/resources/aide-financiere-ademe.JPG
ADDED
frontend/app/resources/logo_ademe.png
ADDED
frontend/app/settings.py
ADDED
@@ -0,0 +1,11 @@
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    # model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")
+    api_url_chat: str = "http://backend/chatting/chat"
+    api_url_embedding: str = "http://backend/embeddings/embedded"
+    api_url_sum: str = "http://backend/chatting/summary"
+
+
+settings = Settings()
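These service-name URLs (http://backend/...) are currently unused: frontend/app/main.py above hardcodes the localhost /api/ variants instead. Wiring the page to the settings object would look like this short sketch:

import requests
from settings import settings

# Uses the configured URL instead of a hardcoded constant
response = requests.post(settings.api_url_chat, json={"user_query": "Bonjour"})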
frontend/test/test_main.py
ADDED
@@ -0,0 +1,6 @@
+"""Basic test."""
+
+
+def test_basic() -> None:
+    """Test that 1 + 1 equals 2."""
+    assert 1 + 1 == 2
nginx.conf
ADDED
@@ -0,0 +1,36 @@
+user user;
+worker_processes 1;
+
+events {
+    worker_connections 1024;
+}
+
+http {
+    server {
+        listen 80;
+
+        location /api/ {
+            proxy_pass http://127.0.0.1:8000/;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        location / {
+            proxy_pass http://127.0.0.1:8501/;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        location /qdrant/ {
+            proxy_pass http://127.0.0.1:6333/;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+    }
+}
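Because each proxy_pass URL ends with a trailing slash, nginx strips the matched location prefix before forwarding, so /api/chatting/chat reaches the FastAPI app as /chatting/chat. A quick sketch of the mapping, assuming the container publishes port 80:

import requests

# Through the proxy: /api/... reaches the backend on :8000 without the /api prefix
requests.post("http://localhost/api/chatting/chat", json={"user_query": "..."})

# Equivalent direct call from inside the container:
requests.post("http://127.0.0.1:8000/chatting/chat", json={"user_query": "..."})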
pyproject.toml
ADDED
@@ -0,0 +1,144 @@
+[project]
+name = "kzs-rag"
+version = "0.1.0"
+description = "RAG project by kaizen"
+readme = "readme.md"
+requires-python = ">=3.12,<3.13"
+dependencies = [
+    "pydantic-settings>=2.6.1",
+]
+
+[dependency-groups]
+frontend = [
+    "streamlit>=1.40.1",
+]
+
+backend = [
+    "docling>=2.8.1",
+    "fastapi[standard]>=0.115.4",
+    "langchain-community>=0.3.8",
+    "langchain-openai>=0.2.10",
+    "langchain-qdrant>=0.2.0",
+    "langgraph>=0.2.53",
+    "qdrant-client>=1.12.1",
+    "sentence-transformers>=3.3.1",
+    "openparse>=0.7.0",
+    # "fpdf>=1.7.2",
+    "fpdf2>=2.8.1",
+]
+
+dev = [
+    "mypy>=1.13.0",
+    "pytest>=8.3.3",
+    "ruff>=0.7.1",
+    "pytest-forked>=1.6.0",
+    "pytest-gitignore>=1.3",
+    "pytest-html>=4.1.1",
+    "pytest-xdist>=3.6.1",
+    "pandas>=2.2.3",
+    "pandas-stubs>=2.2.3.241009",
+    # "gitlabci-local>=10.2.0",
+    "plotly>=5.24.1",
+    "ipykernel>=6.29.5",
+]
+
+[tool.ruff]
+target-version = "py312"
+fix = false
+line-length = 88  # Same as Black
+exclude = [
+    ".git",
+    ".git-rewrite",
+    ".mypy_cache",
+    ".pytype",
+    ".ruff_cache",
+    "__pypackages__",
+    ".venv"
+]
+
+[tool.ruff.lint]
+fixable = ["ALL"]  # Allow autofix for all enabled rules
+unfixable = []
+# Rule selection
+select = [
+    "F", "E", "C90", "N", "D", "UP", "YTT", "ANN", "ASYNC", "S", "BLE",
+    "FBT", "B", "A", "C4", "DTZ", "T10", "DJ", "EXE", "FA", "ISC",
+    "ICN", "G", "INP", "PIE", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SLOT", "SIM",
+    "TID", "TCH", "INT", "ARG", "PTH", "TD", "FIX", "ERA", "PD", "PGH", "PL", "TRY",
+    "FLY", "NPY", "AIR", "PERF", "RUF", "T20", "I"
+]
+# Not selected:
+# - CPY (flake8-copyright) no need of a copyright per file
+# - COM (flake8-commas) handled by ruff
+# - EM (flake8-errmsg) too little gain for the cost
+
+ignore = [
+    "D203",  # 1 blank line required before class docstring
+    "D212",  # Multi-line docstring summary should start at the first line
+    "TRY003",  # Avoid specifying long messages outside the exception class
+    "ANN101",  # Missing type annotation for self in method
+    "ANN102",  # Missing type annotation for cls in classmethod
+    "G004",  # Logging statement uses f-string
+    "PD013",  # `.melt` is preferred to `.stack`; provides same functionality (WRONG!)
+]
+
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["D104"]  # Ignore "missing docstring in public package" in all `__init__.py` files
+"test/**/*.py" = [  # Ignore rules necessary for tests
+    "INP001",  # Ignore "File is part of an implicit namespace package. Add an `__init__.py`."
+    "S101",  # Ignore "Use of `assert` detected" because pytest relies on assert
+    "N802",  # Ignore "Function name should be lowercase" because test functions are non-standard
+    # "ARG",  # Unused function args -> fixtures nevertheless are functionally relevant...
+    # "FBT",  # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize()
+    "PLR2004",  # Ignore "Magic value used in comparison"
+    # "S311",  # Standard pseudo-random generators are not suitable for cryptographic purposes
+]
+
+
+[tool.ruff.lint.flake8-annotations]
+mypy-init-return = true
+
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
+
+
+[tool.mypy]
+python_version = "3.12"
+exclude = [
+]
+
+
+[[tool.mypy.overrides]]
+module = [""
+]
+ignore_missing_imports = true
+
+[tool.pytest.ini_options]
+
+addopts = [
+    "--import-mode=prepend",
+    "-vv",
+    "--exitfirst",
+    "--capture=no",
+    "--showlocals",
+    # "--forked",
+    # "--cov-config=.coverage/coveragerc",
+    # "--cov=src",
+    # "--cov=app",
+    # "--cov-report=html",
+    "--html=.pytest_cache/report.html",
+]
+
+python_files = "*.py"
+
+norecursedirs = [
+    "dist",
+    "doc",
+    "__pycache__",
+]
+
+[tool.pymarkdown]
+# plugins.line-length.line_length = 88
+# plugins.ul-style.style = "sublist"
+# extensions.front-matter.enabled = true
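Note that [dependency-groups] is the uv/PEP 735 mechanism, so the groups above are presumably installed selectively at build time, e.g. with uv sync --group backend --group dev for the API image and uv sync --group frontend for the Streamlit one; the exact flags depend on the uv version captured in uv.lock.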
supervisord.conf
ADDED
@@ -0,0 +1,28 @@
+[supervisord]
+nodaemon=true
+
+[program:nginx]
+command=/usr/sbin/nginx -g "daemon off;"
+autostart=true
+autorestart=true
+user=user
+
+[program:qdrant]
+command=/usr/local/bin/qdrant
+autostart=true
+autorestart=true
+user=user
+
+[program:backend]
+command=uv run fastapi run app/main.py --host 0.0.0.0 --port 8000
+directory=/home/user/app/backend
+autostart=true
+autorestart=true
+user=user
+
+[program:frontend]
+command=uv run streamlit run app/main.py --server.port 8501 --server.address=0.0.0.0
+directory=/home/user/app/frontend
+autostart=true
+autorestart=true
+user=user
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff