Petermoyano committed
Commit 43fee5b · 1 Parent(s): 09113b6

Remove links and add debug file

Files changed (5):
  1. .vscode/launch.json +17 -0
  2. backend/__init__.py +0 -0
  3. debug.py +7 -0
  4. ingestion.py +14 -12
  5. main.py +1 -1
.vscode/launch.json ADDED
@@ -0,0 +1,17 @@
+ {
+     // Use IntelliSense to learn about possible attributes.
+     // Hover to view descriptions of existing attributes.
+     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+     "version": "0.2.0",
+     "configurations": [
+         {
+             "name": "Streamlit: Current File",
+             "type": "python",
+             "request": "launch",
+             "program": "/home/peter/.local/share/virtualenvs/langchain-docs-chatbot-xgtLqNa5/bin/streamlit run /home/peter/chatbots/langchain-docs-chatbot/main.py",
+             "console": "integratedTerminal",
+             "justMyCode": true,
+             "args": ["run", "main.py"]
+         }
+     ]
+ }
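As committed, this configuration is unlikely to launch: VS Code's "program" field expects the path to a single script or executable, but here it holds an entire shell command, and "args" then repeats "run main.py" on top of that. A minimal sketch of a working variant, assuming the same virtualenv path and that main.py sits at the workspace root:

    {
        "version": "0.2.0",
        "configurations": [
            {
                "name": "Streamlit: main.py",
                "type": "python",
                "request": "launch",
                // Point "program" at the streamlit executable itself;
                // VS Code passes the "run main.py" arguments via "args".
                "program": "/home/peter/.local/share/virtualenvs/langchain-docs-chatbot-xgtLqNa5/bin/streamlit",
                "args": ["run", "main.py"],
                "console": "integratedTerminal",
                "justMyCode": true
            }
        ]
    }

Because the streamlit entry point is itself a Python script, the debugger can launch it directly this way and break inside main.py.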
backend/__init__.py ADDED
File without changes
debug.py ADDED
@@ -0,0 +1,7 @@
+ import main # replace 'your_streamlit_app' with the name of your main Streamlit script
+ import streamlit as st
+
+ # This command ensures Streamlit doesn't rerun the entire script on save
+ st.set_run_on_save(False)
+
+ # Now import your Streamlit app script
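One caveat: st.set_run_on_save does not appear to be part of Streamlit's public API, so this script will likely raise an AttributeError. Run-on-save is normally controlled through Streamlit's configuration instead; a minimal sketch, assuming the standard .streamlit/config.toml mechanism:

    # .streamlit/config.toml
    [server]
    runOnSave = false

The same setting can also be passed on the command line: streamlit run main.py --server.runOnSave=false.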
ingestion.py CHANGED
@@ -22,29 +22,31 @@ pinecone.init(api_key=os.environ["PINECONE_API_KEY"],


  def ingest_docs() -> None:
-     # The ReadTheDocsLoader is a class that is in charge of taking the dump of some scraped data
-     # fetching process and loading it into the vectorstore.
-     loader = ReadTheDocsLoader("langchain-docs/")
-     # The load method returns a list of documents, which are the objects that are going to be
-     # raw_documents is a list of dictionaries, each dictionary represents a document object.
+     # The ReadTheDocsLoader is a class that is in charge of taking the dump of some scraped
+     # data-fetching process and loading it into the vectorstore.
+     loader = ReadTheDocsLoader(
+         "langchain-docs/langchain.readthedocs.io/en/latest/"
+     )
+
+     # loader.load() -> [documents] (documents are just dictionaries)
      raw_documents = loader.load()
+
      print(f"Loaded {len(raw_documents)} documents")
-     # gpt-3.5-turbo has a 4096 token limit (query + result), so we need to split the documents into chunks.
-     # A good rule of thumb is to split the documents into 5 chunks
+
      text_splitter = RecursiveCharacterTextSplitter(
          chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])

-     # Take the langchain raw documents and split them into chunks.
+     # Execute the splitter, to allow parallelization of the embedding process.
      documents = text_splitter.split_documents(documents=raw_documents)

      print(f"Split {len(documents)} documents into chunks")

-     # Simple dictionary manipulation to change the source path of the documents, to a valid url.
-     # This will enable us later to access what vectors (pages of langchain in this case) the RetrievalQA
-     # chain sent to the LLM as "relevant" context.
+     # Simple dictionary manipulation to change the source path of the documents to a valid
+     # langchain docs page. This will enable us later to have easy access to the "relevant"
+     # context (proximity search).
      for doc in documents:
          old_path = doc.metadata["source"]
-         new_url = old_path.replace("langchain-docs", "https:/")
+         new_url = old_path.replace(
+             "langchain-docs/", "https:/")
          doc.metadata.update({"source": new_url})

      print(f"Uploading {len(documents)} documents to vectorstore (pinecone)")
main.py CHANGED
@@ -35,7 +35,7 @@ if prompt:
          ]
      )
      formatted_response = (
-         f"{generated_response['result']} \n\n Sources: {create_sources_string(sources)}"
+         f"{generated_response['result']}"
      )

      # Add the user's prompt and the chatbot's response to the session state variables.
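This is the "Remove links" half of the commit: the response no longer appends a "Sources:" section built by create_sources_string(sources). The helper itself is not shown in this diff; for reference, a hypothetical sketch of what such a function typically looks like (signature, naming, and numbering scheme are all assumptions, not taken from this repo):

    from typing import Set

    def create_sources_string(source_urls: Set[str]) -> str:
        # Hypothetical helper: renders retrieved source URLs as a numbered list.
        if not source_urls:
            return ""
        sources_string = "sources:\n"
        for i, source in enumerate(sorted(source_urls)):
            sources_string += f"{i + 1}. {source}\n"
        return sources_string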