Spaces:

towardsai-tutors
/

buster

Running

App Files Community

jerpint committed on Aug 1, 2023

Commit

51727c4

unverified ·

1 Parent(s): fbf9436

Update prompts (#3)

Browse files

* update prompt

* use buster for adding documents

* refactor

* add README for spaces

* add .gitignore and gitattributes

* install buster from main branch

Files changed (7) hide show

.gitattributes +35 -0
.gitignore +5 -0
README.md +10 -0
cfg.py +51 -35
embed_documents.py +12 -48
gradio_app.py +8 -6
requirements.txt +1 -1

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+*.csv
+*.zip
+deeplake_store/
+.DS_Store
+__pycache__/

README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: TowardsAI 🤝 Buster
+emoji: 🤖
+colorFrom: pink
+colorTo: green
+sdk: gradio
+sdk_version: 3.39.0
+app_file: gradio_app.py
+pinned: false
+---

cfg.py CHANGED Viewed

@@ -15,20 +15,27 @@ from utils import extract_zip
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 HUB_TOKEN = os.getenv("HUB_TOKEN")
 REPO_ID = "jerpint/towardsai-buster-data"
 HUB_DB_FILE = "deeplake_store.zip"
-logger.info(f"Downloading {HUB_DB_FILE} from hub...")
-hf_hub_download(
-    repo_id=REPO_ID,
-    repo_type="dataset",
-    filename=HUB_DB_FILE,
-    token=HUB_TOKEN,
-    local_dir=".",
-)
-extract_zip(zip_file_path="deeplake_store.zip", output_path="deeplake_store")
 buster_cfg = BusterConfig(
@@ -90,26 +97,31 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
         "max_tokens": 3500,
         "text_before_docs": (
             "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
             "If the answer is in the documentation, summarize it in a helpful way to the user. "
-            "If it isn't, simply reply that you cannot answer the question. "
-            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
-            "Here is the documentation: "
             "<DOCUMENTS> "
         ),
         "text_after_docs": (
             "<\DOCUMENTS>\n"
             "REMEMBER:\n"
             "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
             "Here are the rules you must follow:\n"
-            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
-            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
-            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
-            "4) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
-            "5) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
-            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
             "For example:\n"
             "What is the meaning of life for a qa bot?\n"
-            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
             "Now answer the following question:\n"
         ),
     },
@@ -117,19 +129,23 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
 # initialize buster with the config in cfg.py (adapt to your needs) ...
 # buster_cfg = cfg.buster_cfg
-retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
-tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
-document_answerer: DocumentAnswerer = DocumentAnswerer(
-    completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
-    documents_formatter=DocumentsFormatter(
-        tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
-    ),
-    prompt_formatter=PromptFormatter(
-        tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
-    ),
-    **buster_cfg.documents_answerer_cfg,
-)
-validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
-buster: Buster = Buster(
-    retriever=retriever, document_answerer=document_answerer, validator=validator
-)

 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
+# Credentials compared against log-in attempts; read from the environment
+# (os.getenv returns None when a variable is unset).
+USERNAME = os.getenv("BUSTER_USERNAME")
+PASSWORD = os.getenv("BUSTER_PASSWORD")
 HUB_TOKEN = os.getenv("HUB_TOKEN")
 REPO_ID = "jerpint/towardsai-buster-data"
 HUB_DB_FILE = "deeplake_store.zip"
+# Reuse a local copy of the archive when one exists in the working directory;
+# otherwise download it from the hub dataset repo into the current directory.
+if os.path.exists(HUB_DB_FILE):
+    logger.info(f"Using local {HUB_DB_FILE}...")
+else:
+    logger.info(f"Downloading {HUB_DB_FILE} from hub...")
+    hf_hub_download(
+        repo_id=REPO_ID,
+        repo_type="dataset",
+        filename=HUB_DB_FILE,
+        token=HUB_TOKEN,
+        local_dir=".",
+    )
+# Runs at module level in both branches: the archive is extracted whether it
+# was just downloaded or was already present locally.
+extract_zip(zip_file_path=HUB_DB_FILE, output_path="deeplake_store")
 buster_cfg = BusterConfig(
         "max_tokens": 3500,
         "text_before_docs": (
             "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
+            "You are provided information found in the <DOCUMENTS> tag. "
+            "Only respond with infomration inside the <DOCUMENTS> tag. DO NOT use additional information, even if you know the answer. "
             "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
+            "Here is the information you can use: "
             "<DOCUMENTS> "
         ),
         "text_after_docs": (
             "<\DOCUMENTS>\n"
             "REMEMBER:\n"
             "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
+            "You are provided information found in the <DOCUMENTS> tag. "
             "Here are the rules you must follow:\n"
+            "* Only respond with infomration inside the <DOCUMENTS> tag. DO NOT providew additional information, even if you know the answer. "
+            "* If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
+            "* Only summarize the information in the <DOCUMENTS> tag, do not respond otherwise. "
+            "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "* Do not reference any links, urls or hyperlinks in your answers.\n"
+            "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
+            "* If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'"
             "For example:\n"
             "What is the meaning of life for a qa bot?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?"
             "Now answer the following question:\n"
         ),
     },
 # initialize buster with the config in cfg.py (adapt to your needs) ...
 # buster_cfg = cfg.buster_cfg
+def setup_buster(buster_cfg):
+    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
+    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
+    document_answerer: DocumentAnswerer = DocumentAnswerer(
+        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
+        documents_formatter=DocumentsFormatter(
+            tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
+        ),
+        prompt_formatter=PromptFormatter(
+            tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
+        ),
+        **buster_cfg.documents_answerer_cfg,
+    )
+    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
+    buster: Buster = Buster(
+        retriever=retriever, document_answerer=document_answerer, validator=validator
+    )
+    return buster

embed_documents.py CHANGED Viewed

@@ -1,61 +1,25 @@
 import openai
 import pandas as pd
-from deeplake.core.vectorstore import VectorStore
 from utils import zip_contents
-def embedding_function(texts, model="text-embedding-ada-002"):
-    if isinstance(texts, str):
-        texts = [texts]
-    texts = [t.replace("\n", " ") for t in texts]
-    return [
-        data["embedding"]
-        for data in openai.Embedding.create(input=texts, model=model)["data"]
-    ]
-def extract_metadata(df: pd.DataFrame) -> dict:
-    """extract the metadata from the dataframe in deeplake dict format"""
-    metadata = df.apply(
-        lambda x: {
-            "url": x.url,
-            "source": x.source,
-            "title": x.title,
-        },
-        axis=1,
-    ).to_list()
-    return metadata
 if __name__ == "__main__":
     vector_store_path = "deeplake_store"
-    chunk_file = "data/chunks_preprocessed.csv"
     overwrite = True
-    df = pd.read_csv(chunk_file)
-    for col in ["url", "source", "title", "content"]:
-        assert col in df.columns
-    # extract the text + metadata
-    metadata = extract_metadata(df)
-    chunked_text = df.content.to_list()
-    # init the vector store
-    vector_store = VectorStore(
-        path=vector_store_path,
-        overwrite=True,
-    )
-    # add the embeddings
-    vector_store.add(
-        text=chunked_text,
-        embedding_function=embedding_function,
-        embedding_data=chunked_text,
-        metadata=metadata,
-    )
-    # save the deeplake folder to a zip file
-    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
     print(f"Contents zipped to: {zipped_file_path}")

 import openai
 import pandas as pd
+from buster.documents import DeepLakeDocumentsManager
 from utils import zip_contents
+def read_csv(filename: str):
+    """Assumes a pre-chunked csv file is provided with expected columns."""
+    df = pd.read_csv(filename)
+    for col in ["url", "source", "title", "content"]:
+        assert col in df.columns
+    return df
 if __name__ == "__main__":
+    # Script entry point: load the chunked CSV, add it to a documents manager
+    # rooted at `vector_store_path`, then zip the result and report its path.
     vector_store_path = "deeplake_store"
+    chunk_file = "data/outputs.csv"
     overwrite = True
+    df = read_csv(chunk_file)
+    # NOTE(review): `overwrite` presumably replaces any existing store at this
+    # path -- confirm against buster's DeepLakeDocumentsManager.
+    dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite)
+    dm.add(df)
+    zipped_file_path = dm.to_zip()
     print(f"Contents zipped to: {zipped_file_path}")

gradio_app.py CHANGED Viewed

@@ -5,18 +5,20 @@ import gradio as gr
 import pandas as pd
 import cfg
-from cfg import buster
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
-USERNAME = os.getenv("BUSTER_USERNAME")
-PASSWORD = os.getenv("BUSTER_PASSWORD")
 def check_auth(username: str, password: str) -> bool:
-    valid_user = username == USERNAME
-    valid_password = password == PASSWORD
     is_auth = valid_user and valid_password
     logger.info(f"Log-in attempted by {username=}. {is_auth=}")
     return is_auth

 import pandas as pd
 import cfg
+from cfg import setup_buster
+buster = setup_buster(cfg.buster_cfg)
+# Raise the httpx logger to WARNING: its lower-level logs are spammy and uninformative.
+logging.getLogger("httpx").setLevel(logging.WARNING)
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 def check_auth(username: str, password: str) -> bool:
+    """Return whether the credentials match ``cfg.USERNAME`` and ``cfg.PASSWORD``.
+
+    The attempted username and the outcome are logged at INFO level; the
+    password itself is not logged.
+    """
+    valid_user = username == cfg.USERNAME
+    valid_password = password == cfg.PASSWORD
     is_auth = valid_user and valid_password
     logger.info(f"Log-in attempted by {username=}. {is_auth=}")
     return is_auth

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-git+https://github.com/jerpint/buster@v1.0.14
 gradio
 deeplake

+git+https://github.com/jerpint/buster@main
 gradio
 deeplake