File size: 2,554 Bytes
2e85929
 
 
 
 
dc78e2e
 
 
 
 
 
3d3f248
dc78e2e
 
 
 
 
 
 
 
 
2e85929
 
 
43fee5b
 
 
 
 
 
 
2e85929
43fee5b
dc78e2e
43fee5b
dc78e2e
 
 
43fee5b
dc78e2e
 
 
3d3f248
43fee5b
 
dc78e2e
 
43fee5b
 
dc78e2e
 
 
 
 
 
 
 
3d3f248
dc78e2e
2e85929
 
 
dc78e2e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
This file es responsible for the ingestion of the data (langchain documentation).
It embedds the data into vectors, and stores it in the pinecone vectorstore.
"""
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

import os
import pinecone
from consts import INDEX_NAME

# Initialize the Pinecone client at import time so every later call
# (e.g. Pinecone.from_documents) can reach the index without re-configuring.
# Requires PINECONE_API_KEY and PINECONE_ENVIRONMENT to be set; raises
# KeyError at import if either env var is missing.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"],
              environment=os.environ["PINECONE_ENVIRONMENT"])

# The ingestion process is divided into 3 steps:
# 1. Load the documents from the source (ReadTheDocsLoader)
# 2. Split the documents into chunks (RecursiveCharacterTextSplitter)
# 3. Embed the chunks into vectors and store them in the vectorstore (Pinecone.from_documents)


def ingest_docs() -> None:
    """Ingest the scraped LangChain docs into the Pinecone vectorstore.

    Pipeline:
        1. Load raw HTML docs from the local ReadTheDocs dump.
        2. Split them into overlapping chunks for embedding.
        3. Rewrite each chunk's ``source`` metadata to the public docs URL.
        4. Embed the chunks with OpenAI and upload them to the Pinecone
           index named ``INDEX_NAME``.

    Side effects: reads the local ``langchain-docs/`` directory, calls the
    OpenAI embeddings API, and writes vectors to Pinecone. Returns nothing.
    """
    # The ReadTheDocsLoader takes the dump of a scraped data-fetching
    # process and turns it into LangChain documents.
    loader = ReadTheDocsLoader(
        "langchain-docs/langchain.readthedocs.io/en/latest/"
    )

    # loader.load() -> [documents] (documents are just dictionaries)
    raw_documents = loader.load()

    print(f"Loaded {len(raw_documents)} documents")

    # Chunk size/overlap chosen so chunks fit comfortably in the embedding
    # model's context while keeping some continuity between neighbors.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])

    # Execute splitter, to allow parallelization of the embedding process.
    documents = text_splitter.split_documents(documents=raw_documents)

    print(f"Split {len(documents)} documents into chunks")

    # Rewrite the on-disk source path of each chunk into the corresponding
    # public docs URL, so retrieved chunks can link back to their page.
    # BUG FIX: the original replaced "langchain-docs/" with "https:/"
    # (single slash), producing malformed URLs like
    # "https:/langchain.readthedocs.io/..."; use "https://" instead.
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace(
            "langchain-docs/", "https://")
        doc.metadata.update({"source": new_url})

    print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
    # The embeddings object is in charge of embedding the documents into vectors.
    embeddings = OpenAIEmbeddings()

    # Take the chunks, embed them into vectors and store them in the
    # Pinecone vector database under INDEX_NAME.
    Pinecone.from_documents(documents,
                            embeddings, index_name=INDEX_NAME)
    print("*********Added documents to Pinecone*********")


# Run the full ingestion pipeline when executed as a script (no-op on import).
if __name__ == "__main__":
    ingest_docs()