Petermoyano commited on
Commit
dc78e2e
·
1 Parent(s): 2e85929

add ingestion file

Browse files
Files changed (1) hide show
  1. ingestion.py +44 -4
ingestion.py CHANGED
@@ -2,16 +2,56 @@
2
  This file is responsible for the ingestion of the data (langchain documentation).
3
  It embeds the data into vectors, and stores it in the pinecone vectorstore.
4
  """
5
- import os
6
  from langchain.document_loaders import ReadTheDocsLoader
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def ingest_docs() -> None:
10
- # The ReadTheDocsLoader is a class that is in charge of taking the dump of some data
11
  # fetching process and loading it into the vectorstore.
12
- loader = ReadTheDocsLoader("langchain-docs-chatbot/langchain-docs")
 
 
13
  raw_documents = loader.load()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
 
16
  if __name__ == '__main__':
17
- print('Hello world!')
 
2
  This file is responsible for the ingestion of the data (langchain documentation).
3
  It embeds the data into vectors, and stores it in the pinecone vectorstore.
4
  """
 
5
import os

import pinecone
from langchain.document_loaders import ReadTheDocsLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

# Connect to Pinecone at import time so the ingestion function can upsert
# directly. Credentials come from the environment; a missing variable raises
# KeyError early rather than failing mid-ingestion.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)

# Ingestion pipeline overview:
#   1. Load documents from the scraped dump (ReadTheDocsLoader).
#   2. Split the documents into chunks (RecursiveCharacterTextSplitter).
#   3. Embed the chunks and store the vectors (Pinecone.from_documents).
21
 
22
 
23
def ingest_docs(docs_path: str = "langchain-docs/",
                index_name: str = "langchain-docs-index") -> None:
    """Ingest the scraped LangChain documentation into a Pinecone index.

    Pipeline:
      1. Load the scraped ReadTheDocs dump from ``docs_path``.
      2. Split the documents into overlapping chunks so several chunks fit
         inside a gpt-3.5-turbo prompt (4096-token limit, prompt + completion).
      3. Embed each chunk with OpenAI embeddings and upsert the vectors into
         the Pinecone index ``index_name``.

    Args:
        docs_path: Directory holding the ReadTheDocs HTML dump.
        index_name: Name of the target Pinecone index.

    Raises:
        KeyError: If a loaded document has no ``source`` metadata entry.
    """
    # ReadTheDocsLoader turns the scraped HTML dump into Document objects;
    # load() returns one Document per file.
    loader = ReadTheDocsLoader(docs_path)
    raw_documents = loader.load()
    print(f"Loaded {len(raw_documents)} documents")

    # Chunk size/overlap chosen so chunks stay well under the model's token
    # limit; the separator cascade prefers paragraph, then line, then word
    # boundaries before falling back to a hard character split.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])
    documents = text_splitter.split_documents(documents=raw_documents)
    print(f"Split {len(documents)} documents into chunks")

    # Rewrite each chunk's on-disk source path into the public https:// URL it
    # was scraped from, so answers can cite a clickable link, e.g.
    # "langchain-docs/python.langchain.com/x.html" -> "https://python.langchain.com/x.html".
    url_prefix = docs_path.rstrip("/")
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace(url_prefix, "https:/")
        doc.metadata.update({"source": new_url})

    print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
    # OpenAIEmbeddings converts each chunk to a vector; Pinecone.from_documents
    # embeds and upserts them into the index in one call.
    embeddings = OpenAIEmbeddings()
    Pinecone.from_documents(documents,
                            embeddings, index_name=index_name)
    print("*********Added documents to Pinecone*********")
54
 
55
 
56
# Allow running this module directly as a one-off ingestion script.
if __name__ == '__main__':
    ingest_docs()