MohamedLamineBamba
commited on
Commit
·
0c69aa1
1
Parent(s):
a7aa9c3
feat: Parent Docuement Retriever
Browse files- app.py +29 -25
- requirements.txt +2 -1
- scrape_data.py +1 -18
app.py
CHANGED
@@ -11,24 +11,12 @@ from langchain_google_genai import (
|
|
11 |
HarmBlockThreshold,
|
12 |
HarmCategory,
|
13 |
)
|
14 |
-
|
|
|
|
|
15 |
import config
|
16 |
from prompts import prompt
|
17 |
-
|
18 |
-
metadata_field_info = [
|
19 |
-
AttributeInfo(
|
20 |
-
name="title",
|
21 |
-
description="Le titre de l'article",
|
22 |
-
type="string",
|
23 |
-
),
|
24 |
-
AttributeInfo(
|
25 |
-
name="date",
|
26 |
-
description="Date de publication",
|
27 |
-
type="string",
|
28 |
-
),
|
29 |
-
AttributeInfo(name="link", description="Source de l'article", type="string"),
|
30 |
-
]
|
31 |
-
document_content_description = "Articles sur l'actualité."
|
32 |
|
33 |
model = GoogleGenerativeAI(
|
34 |
model=config.GOOGLE_CHAT_MODEL,
|
@@ -45,29 +33,45 @@ embedding = embeddings_model = GoogleGenerativeAIEmbeddings(
|
|
45 |
|
46 |
vectordb = Chroma(persist_directory=config.STORAGE_PATH, embedding_function=embedding)
|
47 |
|
48 |
-
retriever
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
|
56 |
@cl.on_chat_start
|
57 |
async def on_chat_start():
|
58 |
|
59 |
-
def format_docs(
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
rag_chain = (
|
63 |
{
|
64 |
-
"context":
|
65 |
"question": RunnablePassthrough(),
|
66 |
}
|
67 |
| prompt
|
68 |
| model
|
69 |
| StrOutputParser()
|
70 |
)
|
|
|
71 |
|
72 |
cl.user_session.set("rag_chain", rag_chain)
|
73 |
|
|
|
11 |
HarmBlockThreshold,
|
12 |
HarmCategory,
|
13 |
)
|
14 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
15 |
+
from langchain.retrievers import ParentDocumentRetriever
|
16 |
+
from langchain.storage import InMemoryStore
|
17 |
import config
|
18 |
from prompts import prompt
|
19 |
+
import tiktoken
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
model = GoogleGenerativeAI(
|
22 |
model=config.GOOGLE_CHAT_MODEL,
|
|
|
33 |
|
34 |
vectordb = Chroma(persist_directory=config.STORAGE_PATH, embedding_function=embedding)
|
35 |
|
36 |
+
## retriever
|
37 |
+
|
38 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, separators=["\n"])
|
39 |
+
|
40 |
+
|
41 |
+
# The storage layer for the parent documents
|
42 |
+
store = InMemoryStore()
|
43 |
+
retriever = ParentDocumentRetriever(
|
44 |
+
vectorstore=vectordb,
|
45 |
+
docstore=store,
|
46 |
+
child_splitter=text_splitter,
|
47 |
+
)
|
48 |
+
|
49 |
|
50 |
|
51 |
@cl.on_chat_start
|
52 |
async def on_chat_start():
|
53 |
|
54 |
+
def format_docs(documents, max_context_size= 100000, separator= "\n\n"):
|
55 |
+
context = ""
|
56 |
+
encoder = tiktoken.get_encoding("cl100k_base")
|
57 |
+
i=0
|
58 |
+
for doc in documents:
|
59 |
+
i+=1
|
60 |
+
if len(encoder.encode(context)) < max_context_size:
|
61 |
+
source = doc.metadata['link']
|
62 |
+
context += f"Article{i}:\n"+doc.page_content + f"\nSource: {source}" + separator
|
63 |
+
return context
|
64 |
|
65 |
rag_chain = (
|
66 |
{
|
67 |
+
"context": retriever | format_docs,
|
68 |
"question": RunnablePassthrough(),
|
69 |
}
|
70 |
| prompt
|
71 |
| model
|
72 |
| StrOutputParser()
|
73 |
)
|
74 |
+
|
75 |
|
76 |
cl.user_session.set("rag_chain", rag_chain)
|
77 |
|
requirements.txt
CHANGED
@@ -4,4 +4,5 @@ chainlit==1.0.500
|
|
4 |
chromadb==0.4.24
|
5 |
lark==1.1.9
|
6 |
bs4==0.0.2
|
7 |
-
selenium==4.19.0
|
|
|
|
4 |
chromadb==0.4.24
|
5 |
lark==1.1.9
|
6 |
bs4==0.0.2
|
7 |
+
selenium==4.19.0
|
8 |
+
tiktoken==0.1.1
|
scrape_data.py
CHANGED
@@ -120,24 +120,7 @@ def process_docs(
|
|
120 |
documents=splits,
|
121 |
embedding=embeddings_model,
|
122 |
persist_directory=persist_directory,
|
123 |
-
)
|
124 |
-
|
125 |
-
# Indexing data
|
126 |
-
namespace = "chromadb/my_documents"
|
127 |
-
record_manager = SQLRecordManager(
|
128 |
-
namespace, db_url="sqlite:///record_manager_cache.sql"
|
129 |
-
)
|
130 |
-
record_manager.create_schema()
|
131 |
-
|
132 |
-
index_result = index(
|
133 |
-
docs,
|
134 |
-
record_manager,
|
135 |
-
doc_search,
|
136 |
-
cleanup="incremental",
|
137 |
-
source_id_key="link",
|
138 |
-
)
|
139 |
-
|
140 |
-
print(f"Indexing stats: {index_result}")
|
141 |
|
142 |
return doc_search
|
143 |
|
|
|
120 |
documents=splits,
|
121 |
embedding=embeddings_model,
|
122 |
persist_directory=persist_directory,
|
123 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
return doc_search
|
126 |
|