Spaces:
Running
Running
Deeplake support (#1)
* add deeplake support
* add helper functions to ingest the data via deeplake
* add script to embed documents to deeplake format
- cfg.py +9 -7
- embed_documents.py +61 -0
- gradio_app.py +4 -5
- requirements.txt +1 -0
- utils.py +53 -0
cfg.py
CHANGED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
-
import os
|
| 2 |
import logging
|
| 3 |
-
|
| 4 |
-
from huggingface_hub import hf_hub_download
|
| 5 |
|
| 6 |
from buster.busterbot import Buster, BusterConfig
|
| 7 |
from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
|
| 8 |
from buster.formatters.documents import DocumentsFormatter
|
| 9 |
from buster.formatters.prompts import PromptFormatter
|
| 10 |
-
from buster.retriever import
|
| 11 |
from buster.tokenizers import GPTTokenizer
|
| 12 |
from buster.validators import QuestionAnswerValidator, Validator
|
|
|
|
| 13 |
|
|
|
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -18,7 +18,7 @@ logging.basicConfig(level=logging.INFO)
|
|
| 18 |
|
| 19 |
HUB_TOKEN = os.getenv("HUB_TOKEN")
|
| 20 |
REPO_ID = "jerpint/towardsai-buster-data"
|
| 21 |
-
HUB_DB_FILE = "
|
| 22 |
logger.info(f"Downloading {HUB_DB_FILE} from hub...")
|
| 23 |
hf_hub_download(
|
| 24 |
repo_id=REPO_ID,
|
|
@@ -28,6 +28,8 @@ hf_hub_download(
|
|
| 28 |
local_dir=".",
|
| 29 |
)
|
| 30 |
|
|
|
|
|
|
|
| 31 |
|
| 32 |
buster_cfg = BusterConfig(
|
| 33 |
validator_cfg={
|
|
@@ -61,7 +63,7 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
|
|
| 61 |
},
|
| 62 |
},
|
| 63 |
retriever_cfg={
|
| 64 |
-
"
|
| 65 |
"top_k": 3,
|
| 66 |
"thresh": 0.7,
|
| 67 |
"max_tokens": 2000,
|
|
@@ -115,7 +117,7 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
|
|
| 115 |
|
| 116 |
# initialize buster with the config in cfg.py (adapt to your needs) ...
|
| 117 |
# buster_cfg = cfg.buster_cfg
|
| 118 |
-
retriever: Retriever =
|
| 119 |
tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
|
| 120 |
document_answerer: DocumentAnswerer = DocumentAnswerer(
|
| 121 |
completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
+
import os
|
|
|
|
| 3 |
|
| 4 |
from buster.busterbot import Buster, BusterConfig
|
| 5 |
from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
|
| 6 |
from buster.formatters.documents import DocumentsFormatter
|
| 7 |
from buster.formatters.prompts import PromptFormatter
|
| 8 |
+
from buster.retriever import DeepLakeRetriever, Retriever
|
| 9 |
from buster.tokenizers import GPTTokenizer
|
| 10 |
from buster.validators import QuestionAnswerValidator, Validator
|
| 11 |
+
from huggingface_hub import hf_hub_download
|
| 12 |
|
| 13 |
+
from utils import extract_zip
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 18 |
|
| 19 |
HUB_TOKEN = os.getenv("HUB_TOKEN")
|
| 20 |
REPO_ID = "jerpint/towardsai-buster-data"
|
| 21 |
+
HUB_DB_FILE = "deeplake_store.zip"
|
| 22 |
logger.info(f"Downloading {HUB_DB_FILE} from hub...")
|
| 23 |
hf_hub_download(
|
| 24 |
repo_id=REPO_ID,
|
|
|
|
| 28 |
local_dir=".",
|
| 29 |
)
|
| 30 |
|
| 31 |
+
extract_zip(zip_file_path="deeplake_store.zip", output_path="deeplake_store")
|
| 32 |
+
|
| 33 |
|
| 34 |
buster_cfg = BusterConfig(
|
| 35 |
validator_cfg={
|
|
|
|
| 63 |
},
|
| 64 |
},
|
| 65 |
retriever_cfg={
|
| 66 |
+
"path": "./deeplake_store",
|
| 67 |
"top_k": 3,
|
| 68 |
"thresh": 0.7,
|
| 69 |
"max_tokens": 2000,
|
|
|
|
| 117 |
|
| 118 |
# initialize buster with the config in cfg.py (adapt to your needs) ...
|
| 119 |
# buster_cfg = cfg.buster_cfg
|
| 120 |
+
retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
|
| 121 |
tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
|
| 122 |
document_answerer: DocumentAnswerer = DocumentAnswerer(
|
| 123 |
completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
|
embed_documents.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import openai
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from deeplake.core.vectorstore import VectorStore
|
| 4 |
+
|
| 5 |
+
from utils import zip_contents
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed one string or a batch of strings with the OpenAI embeddings API.

    Args:
        texts: A single string or a list of strings to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A list of embedding vectors (one per input text), aligned with the
        input order.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Newlines can degrade embedding quality; flatten them to spaces.
    texts = [t.replace("\n", " ") for t in texts]

    response = openai.Embedding.create(input=texts, model=model)
    # Sort by the per-item "index" field so that embeddings line up with the
    # input texts even if the API returns results out of order.
    return [
        item["embedding"]
        for item in sorted(response["data"], key=lambda item: item["index"])
    ]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def extract_metadata(df: pd.DataFrame) -> list[dict]:
    """Extract per-row metadata from the dataframe in deeplake dict format.

    Note: the original annotation said ``dict``, but the function returns a
    list with one dict per dataframe row.

    Args:
        df: DataFrame with at least the columns 'url', 'source' and 'title'.

    Returns:
        A list of ``{"url", "source", "title"}`` dicts, one per row, in row
        order.
    """
    metadata = df.apply(
        lambda x: {
            "url": x.url,
            "source": x.source,
            "title": x.title,
        },
        axis=1,
    ).to_list()
    return metadata
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
if __name__ == "__main__":
    # One-off job: embed preprocessed document chunks into a deeplake vector
    # store, then zip the store for upload to the hub.
    vector_store_path = "deeplake_store"
    chunk_file = "data/chunks_preprocessed.csv"
    overwrite = True

    df = pd.read_csv(chunk_file)

    # Fail fast if the expected columns are missing from the CSV.
    for col in ["url", "source", "title", "content"]:
        assert col in df.columns, f"missing column: {col}"

    # extract the text + metadata
    metadata = extract_metadata(df)
    chunked_text = df.content.to_list()

    # init the vector store
    vector_store = VectorStore(
        path=vector_store_path,
        # Honour the flag above instead of a hard-coded True (the original
        # defined `overwrite` but never used it).
        overwrite=overwrite,
    )

    # add the embeddings (deeplake calls `embedding_function` on
    # `embedding_data` and stores the vectors alongside text + metadata)
    vector_store.add(
        text=chunked_text,
        embedding_function=embedding_function,
        embedding_data=chunked_text,
        metadata=metadata,
    )

    # save the deeplake folder to a zip file
    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
    print(f"Contents zipped to: {zipped_file_path}")
|
gradio_app.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
import cfg
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
-
from cfg import buster
|
| 7 |
-
|
| 8 |
|
| 9 |
-
import
|
|
|
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -86,7 +85,7 @@ with block:
|
|
| 86 |
placeholder="Ask a question to AI stackoverflow here...",
|
| 87 |
lines=1,
|
| 88 |
)
|
| 89 |
-
submit = gr.Button(value="Send", variant="secondary")
|
| 90 |
|
| 91 |
examples = gr.Examples(
|
| 92 |
examples=[
|
|
|
|
| 1 |
+
import logging
|
| 2 |
import os
|
| 3 |
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
import cfg
|
| 8 |
+
from cfg import buster
|
| 9 |
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 85 |
placeholder="Ask a question to AI stackoverflow here...",
|
| 86 |
lines=1,
|
| 87 |
)
|
| 88 |
+
submit = gr.Button(value="Send", variant="secondary")
|
| 89 |
|
| 90 |
examples = gr.Examples(
|
| 91 |
examples=[
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
git+https://github.com/jerpint/[email protected]
|
| 2 |
gradio
|
|
|
|
|
|
| 1 |
git+https://github.com/jerpint/[email protected]
|
| 2 |
gradio
|
| 3 |
+
deeplake
|
utils.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def zip_contents(input_path, output_path):
    """
    Create a zip archive of everything under ``input_path``.

    Paths inside the archive are stored relative to ``input_path``, and the
    archive is named ``<basename(input_path)>.zip`` inside ``output_path``.

    Args:
        input_path (str): Directory whose contents should be archived.
        output_path (str): Directory in which to create the zip file.

    Returns:
        str: Path of the created zip file.

    Raises:
        ValueError: If ``input_path`` does not exist.
    """
    if not os.path.exists(input_path):
        raise ValueError("The specified input path does not exist.")

    archive_name = os.path.basename(input_path) + ".zip"
    archive_path = os.path.join(output_path, archive_name)

    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _, filenames in os.walk(input_path):
            for filename in filenames:
                absolute = os.path.join(dirpath, filename)
                # Store entries relative to the input directory root.
                archive.write(absolute, arcname=os.path.relpath(absolute, input_path))

    return archive_path
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def extract_zip(zip_file_path, output_path):
    """
    Unpack a zip archive into a destination directory.

    Args:
        zip_file_path (str): Path of the zip archive to unpack.
        output_path (str): Directory into which the contents are extracted.

    Returns:
        str: The destination directory path.

    Raises:
        ValueError: If ``zip_file_path`` does not exist.
    """
    if not os.path.exists(zip_file_path):
        raise ValueError("The specified zip file does not exist.")

    with zipfile.ZipFile(zip_file_path, "r") as archive:
        archive.extractall(output_path)

    return output_path
|