Petermoyano committed
Commit 43fee5b · 1 Parent(s): 09113b6

Remove links and add debug file

Files changed (5):
  1. .vscode/launch.json +17 -0
  2. backend/__init__.py +0 -0
  3. debug.py +7 -0
  4. ingestion.py +14 -12
  5. main.py +1 -1
.vscode/launch.json ADDED
@@ -0,0 +1,17 @@
+ {
+     // Use IntelliSense to learn about possible attributes.
+     // Hover to view descriptions of existing attributes.
+     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+     "version": "0.2.0",
+     "configurations": [
+         {
+             "name": "Streamlit: Current File",
+             "type": "python",
+             "request": "launch",
+             "program": "/home/peter/.local/share/virtualenvs/langchain-docs-chatbot-xgtLqNa5/bin/streamlit run /home/peter/chatbots/langchain-docs-chatbot/main.py",
+             "console": "integratedTerminal",
+             "justMyCode": true,
+             "args": ["run", "main.py"]
+         }
+     ]
+ }
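As committed, this configuration is unlikely to launch: VS Code's "program" field expects the path to a single script or executable, but here it holds an entire shell command, and "args" then repeats "run main.py" on top of that. A minimal sketch of a working variant, assuming the same virtualenv path and that main.py sits at the workspace root:

    {
        "version": "0.2.0",
        "configurations": [
            {
                "name": "Streamlit: main.py",
                "type": "python",
                "request": "launch",
                // Point "program" at the streamlit executable itself;
                // VS Code passes the "run main.py" arguments via "args".
                "program": "/home/peter/.local/share/virtualenvs/langchain-docs-chatbot-xgtLqNa5/bin/streamlit",
                "args": ["run", "main.py"],
                "console": "integratedTerminal",
                "justMyCode": true
            }
        ]
    }

Because the streamlit entry point is itself a Python script, the debugger can launch it directly this way and break inside main.py.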
backend/__init__.py ADDED
File without changes
debug.py ADDED
@@ -0,0 +1,7 @@
+ import main # replace 'your_streamlit_app' with the name of your main Streamlit script
+ import streamlit as st
+
+ # This command ensures Streamlit doesn't rerun the entire script on save
+ st.set_run_on_save(False)
+
+ # Now import your Streamlit app script
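One caveat: st.set_run_on_save does not appear to be part of Streamlit's public API, so this script will likely raise an AttributeError. Run-on-save is normally controlled through Streamlit's configuration instead; a minimal sketch, assuming the standard .streamlit/config.toml mechanism:

    # .streamlit/config.toml
    [server]
    runOnSave = false

The same setting can also be passed on the command line: streamlit run main.py --server.runOnSave=false.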
ingestion.py CHANGED
@@ -22,29 +22,31 @@ pinecone.init(api_key=os.environ["PINECONE_API_KEY"],


  def ingest_docs() -> None:
-     # The ReadTheDocsLoader is a class that is in charge of taking the dump of some scraped data
-     # fetching process and loading it into the vectorstore.
-     loader = ReadTheDocsLoader("langchain-docs/")
-     # The load method returns a list of documents, which are the objects that are going to be
-     # raw_documents is a list of dictionaries, each dictionary represents a document object.
+     # The ReadTheDocsLoader is a class that is in charge of taking the dump of some scraped
+     # data-fetching process and loading it into the vectorstore.
+     loader = ReadTheDocsLoader(
+         "langchain-docs/langchain.readthedocs.io/en/latest/"
+     )
+
+     # loader.load() -> [documents] (documents are just dictionaries)
      raw_documents = loader.load()
+
      print(f"Loaded {len(raw_documents)} documents")
-     # gpt-3.5-turbo has a 4096 token limit (query + result), so we need to split the documents into chunks.
-     # A good rule of thumb is to split the documents into 5 chunks
+
      text_splitter = RecursiveCharacterTextSplitter(
          chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])

-     # Take the langchain raw documents and split them into chunks.
+     # Execute the splitter, to allow parallelization of the embedding process.
      documents = text_splitter.split_documents(documents=raw_documents)

      print(f"Split {len(documents)} documents into chunks")

-     # Simple dictionary manipulation to change the source path of the documents, to a valid url.
-     # This will enable us later to access what vectors (pages of langchain in this case) the RetrievalQA
-     # chain sent to the LLM as "relevant" context.
+     # Simple dictionary manipulation to change the source path of the documents to a valid
+     # langchain docs page. This will enable us later to have easy access to the "relevant"
+     # context (proximity search).
      for doc in documents:
          old_path = doc.metadata["source"]
-         new_url = old_path.replace("langchain-docs", "https:/")
+         new_url = old_path.replace(
+             "langchain-docs/", "https:/")
          doc.metadata.update({"source": new_url})

      print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
main.py CHANGED
@@ -35,7 +35,7 @@ if prompt:
          ]
      )
      formatted_response = (
-         f"{generated_response['result']} \n\n Sources: {create_sources_string(sources)}"
+         f"{generated_response['result']}"
      )

      # Add the user's prompt and the chatbot's response to the session state variables.
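This is the "Remove links" half of the commit: the response no longer appends a "Sources:" section built by create_sources_string(sources). The helper itself is not shown in this diff; for reference, a hypothetical sketch of what such a function typically looks like (signature, naming, and numbering scheme are all assumptions, not taken from this repo):

    from typing import Set

    def create_sources_string(source_urls: Set[str]) -> str:
        # Hypothetical helper: renders retrieved source URLs as a numbered list.
        if not source_urls:
            return ""
        sources_string = "sources:\n"
        for i, source in enumerate(sorted(source_urls)):
            sources_string += f"{i + 1}. {source}\n"
        return sources_string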