Spaces:
Sleeping
Sleeping
Commit
·
dc78e2e
1
Parent(s):
2e85929
add ingestion file
Browse files- ingestion.py +44 -4
ingestion.py
CHANGED
@@ -2,16 +2,56 @@
|
|
2 |
This file es responsible for the ingestion of the data (langchain documentation).
|
3 |
It embedds the data into vectors, and stores it in the pinecone vectorstore.
|
4 |
"""
|
5 |
-
import os
|
6 |
from langchain.document_loaders import ReadTheDocsLoader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
def ingest_docs() -> None:
|
10 |
-
# The ReadTheDocsLoader is a class that is in charge of taking the dump of some data
|
11 |
# fetching process and loading it into the vectorstore.
|
12 |
-
loader = ReadTheDocsLoader("langchain-docs
|
|
|
|
|
13 |
raw_documents = loader.load()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
|
16 |
if __name__ == '__main__':
|
17 |
-
|
|
|
2 |
"""
This file is responsible for the ingestion of the data (langchain documentation).

It embeds the data into vectors and stores them in the Pinecone vectorstore.
"""
import os

import pinecone
from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

# Initialize the Pinecone client once, at import time.
# NOTE: both environment variables must be set or this raises KeyError on import.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"],
              environment=os.environ["PINECONE_ENVIRONMENT"])

# The ingestion process is divided into 3 steps:
# 1. Load the documents from the source (ReadTheDocsLoader)
# 2. Split the documents into chunks (RecursiveCharacterTextSplitter)
# 3. Embed the chunks into vectors and store them in the vectorstore (Pinecone.from_documents)
|
21 |
|
22 |
|
23 |
def ingest_docs() -> None:
    """Ingest the scraped langchain documentation into Pinecone.

    Loads the raw docs dump, splits it into embedding-sized chunks,
    rewrites each chunk's source path into a public URL, embeds the
    chunks with OpenAI, and uploads the vectors to the Pinecone index.

    Raises:
        KeyError: if a loaded document is missing the "source" metadata key.
    """
    # The ReadTheDocsLoader is in charge of taking the dump of some scraped
    # data and loading it into Document objects.
    loader = ReadTheDocsLoader("langchain-docs/")
    # load() returns a list of Document objects (page content + metadata).
    raw_documents = loader.load()
    print(f"Loaded {len(raw_documents)} documents")

    # gpt-3.5-turbo has a 4096 token limit (query + result), so we need to
    # split the documents into chunks before embedding them. 1000 characters
    # with 100 overlap keeps each chunk well inside the context window while
    # preserving continuity across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])

    # Take the langchain raw documents and split them into chunks.
    documents = text_splitter.split_documents(documents=raw_documents)
    print(f"Split {len(documents)} documents into chunks")

    # Simple metadata manipulation to change the local source path of each
    # document into a valid public URL, e.g.
    # "langchain-docs/python.langchain.com/x" -> "https://python.langchain.com/x"
    # (the replacement "https:/" plus the path's leading "/" forms "https://").
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace("langchain-docs", "https:/")
        doc.metadata.update({"source": new_url})

    print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
    # The embeddings object is in charge of embedding the documents into vectors.
    embeddings = OpenAIEmbeddings()

    # Take the chunks, embed them into vectors and store them in the
    # Pinecone vector database.
    Pinecone.from_documents(documents,
                            embeddings, index_name="langchain-docs-index")
    print("*********Added documents to Pinecone*********")
|
54 |
|
55 |
|
56 |
# Entry point: run the full ingestion pipeline when executed as a script.
if __name__ == "__main__":
    ingest_docs()
|