"""
This file es responsible for the ingestion of the data (langchain documentation).
It embedds the data into vectors, and stores it in the pinecone vectorstore.
"""
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import os
import pinecone
from consts import INDEX_NAME
# initialize pinecone client
pinecone.init(api_key=os.environ["PINECONE_API_KEY"],
              environment=os.environ["PINECONE_ENVIRONMENT"])
# The ingestion process is divided into 3 steps:
# 1. Load the documents from the source (ReadTheDocsLoader)
# 2. Split the documents into chunks (RecursiveCharacterTextSplitter)
# 3. Embed the chunks into vectors and store them in the vectorstore (Pinecone.from_documents)
def ingest_docs() -> None:
    # The ReadTheDocsLoader is in charge of taking a dump of scraped
    # Read the Docs pages and loading them into Document objects.
    loader = ReadTheDocsLoader(
        "langchain-docs/langchain.readthedocs.io/en/latest/"
    )
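    # Note (assumption about how the dump was produced): ReadTheDocsLoader expects
    # this directory to contain an HTML dump of the docs, typically created by
    # crawling the site (e.g. with `wget -r`); the path above is the repo's local copy.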
    # loader.load() -> [Document] (each Document holds page_content plus metadata)
    raw_documents = loader.load()
    print(f"Loaded {len(raw_documents)} documents")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""]
    )
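    # RecursiveCharacterTextSplitter tries the separators in order (paragraphs,
    # then lines, then words, then characters) until each chunk fits within
    # chunk_size; chunk_overlap keeps some shared context between adjacent chunks.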
    # Run the splitter so each chunk is small enough to embed (and can be
    # embedded in batches).
    documents = text_splitter.split_documents(documents=raw_documents)
    print(f"Split {len(documents)} documents into chunks")
    # Rewrite each document's source path into the corresponding public
    # LangChain docs URL, so retrieved chunks can later link back to the page
    # they came from (see the illustrative query helper below).
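    # For example (illustrative path):
    #   "langchain-docs/langchain.readthedocs.io/en/latest/index.html"
    #   -> "https://langchain.readthedocs.io/en/latest/index.html"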
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace("langchain-docs/", "https://")
        doc.metadata.update({"source": new_url})
print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
# The embeddings object is in charge of embedding the documents into vectors.
embeddings = OpenAIEmbeddings()
# Take the chunks, imbed them into vectors and store them in the Pinecone vector database.
Pinecone.from_documents(documents,
embeddings, index_name=INDEX_NAME)
print("*********Added documents to Pinecone*********")
if __name__ == '__main__':
    ingest_docs()