File size: 2,554 Bytes
2e85929
 
 
 
 
dc78e2e
 
 
 
 
 
3d3f248
dc78e2e
 
 
 
 
 
 
 
 
2e85929
 
 
43fee5b
 
 
 
 
 
 
2e85929
43fee5b
dc78e2e
43fee5b
dc78e2e
 
 
43fee5b
dc78e2e
 
 
3d3f248
43fee5b
 
dc78e2e
 
43fee5b
 
dc78e2e
 
 
 
 
 
 
 
3d3f248
dc78e2e
2e85929
 
 
dc78e2e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
This file es responsible for the ingestion of the data (langchain documentation).
It embedds the data into vectors, and stores it in the pinecone vectorstore.
"""
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

import os
import pinecone
from consts import INDEX_NAME

# Initialize the Pinecone client at import time so every later call
# (e.g. Pinecone.from_documents) can reach the index without re-configuring.
# Requires PINECONE_API_KEY and PINECONE_ENVIRONMENT to be set; raises
# KeyError at import if either env var is missing.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"],
              environment=os.environ["PINECONE_ENVIRONMENT"])

# The ingestion process is divided into 3 steps:
# 1. Load the documents from the source (ReadTheDocsLoader)
# 2. Split the documents into chunks (RecursiveCharacterTextSplitter)
# 3. Embed the chunks into vectors and store them in the vectorstore (Pinecone.from_documents)


def ingest_docs() -> None:
    """Ingest the scraped LangChain docs into the Pinecone vectorstore.

    Pipeline:
        1. Load raw HTML docs from the local ReadTheDocs dump.
        2. Split them into overlapping chunks for embedding.
        3. Rewrite each chunk's ``source`` metadata to the public docs URL.
        4. Embed the chunks with OpenAI and upload them to the Pinecone
           index named ``INDEX_NAME``.

    Side effects: reads the local ``langchain-docs/`` directory, calls the
    OpenAI embeddings API, and writes vectors to Pinecone. Returns nothing.
    """
    # The ReadTheDocsLoader takes the dump of a scraped data-fetching
    # process and turns it into LangChain documents.
    loader = ReadTheDocsLoader(
        "langchain-docs/langchain.readthedocs.io/en/latest/"
    )

    # loader.load() -> [documents] (documents are just dictionaries)
    raw_documents = loader.load()

    print(f"Loaded {len(raw_documents)} documents")

    # Chunk size/overlap chosen so chunks fit comfortably in the embedding
    # model's context while keeping some continuity between neighbors.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])

    # Execute splitter, to allow parallelization of the embedding process.
    documents = text_splitter.split_documents(documents=raw_documents)

    print(f"Split {len(documents)} documents into chunks")

    # Rewrite the on-disk source path of each chunk into the corresponding
    # public docs URL, so retrieved chunks can link back to their page.
    # BUG FIX: the original replaced "langchain-docs/" with "https:/"
    # (single slash), producing malformed URLs like
    # "https:/langchain.readthedocs.io/..."; use "https://" instead.
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace(
            "langchain-docs/", "https://")
        doc.metadata.update({"source": new_url})

    print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
    # The embeddings object is in charge of embedding the documents into vectors.
    embeddings = OpenAIEmbeddings()

    # Take the chunks, embed them into vectors and store them in the
    # Pinecone vector database under INDEX_NAME.
    Pinecone.from_documents(documents,
                            embeddings, index_name=INDEX_NAME)
    print("*********Added documents to Pinecone*********")


# Run the full ingestion pipeline when executed as a script (no-op on import).
if __name__ == "__main__":
    ingest_docs()