Spaces:
Running
Running
Deeplake support (#1)
* add deeplake support
* add helper functions to ingest the data via deeplake
* add script to embed documents to deeplake format
- cfg.py +9 -7
- embed_documents.py +61 -0
- gradio_app.py +4 -5
- requirements.txt +1 -0
- utils.py +53 -0
cfg.py
CHANGED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
-
import os
|
| 2 |
import logging
|
| 3 |
-
|
| 4 |
-
from huggingface_hub import hf_hub_download
|
| 5 |
|
| 6 |
from buster.busterbot import Buster, BusterConfig
|
| 7 |
from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
|
| 8 |
from buster.formatters.documents import DocumentsFormatter
|
| 9 |
from buster.formatters.prompts import PromptFormatter
|
| 10 |
-
from buster.retriever import
|
| 11 |
from buster.tokenizers import GPTTokenizer
|
| 12 |
from buster.validators import QuestionAnswerValidator, Validator
|
|
|
|
| 13 |
|
|
|
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -18,7 +18,7 @@ logging.basicConfig(level=logging.INFO)
|
|
| 18 |
|
| 19 |
HUB_TOKEN = os.getenv("HUB_TOKEN")
|
| 20 |
REPO_ID = "jerpint/towardsai-buster-data"
|
| 21 |
-
HUB_DB_FILE = "
|
| 22 |
logger.info(f"Downloading {HUB_DB_FILE} from hub...")
|
| 23 |
hf_hub_download(
|
| 24 |
repo_id=REPO_ID,
|
|
@@ -28,6 +28,8 @@ hf_hub_download(
|
|
| 28 |
local_dir=".",
|
| 29 |
)
|
| 30 |
|
|
|
|
|
|
|
| 31 |
|
| 32 |
buster_cfg = BusterConfig(
|
| 33 |
validator_cfg={
|
|
@@ -61,7 +63,7 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
|
|
| 61 |
},
|
| 62 |
},
|
| 63 |
retriever_cfg={
|
| 64 |
-
"
|
| 65 |
"top_k": 3,
|
| 66 |
"thresh": 0.7,
|
| 67 |
"max_tokens": 2000,
|
|
@@ -115,7 +117,7 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
|
|
| 115 |
|
| 116 |
# initialize buster with the config in cfg.py (adapt to your needs) ...
|
| 117 |
# buster_cfg = cfg.buster_cfg
|
| 118 |
-
retriever: Retriever =
|
| 119 |
tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
|
| 120 |
document_answerer: DocumentAnswerer = DocumentAnswerer(
|
| 121 |
completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
|
|
|
|
|
|
|
| 1 |
import logging
|
| 2 |
+
import os
|
|
|
|
| 3 |
|
| 4 |
from buster.busterbot import Buster, BusterConfig
|
| 5 |
from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
|
| 6 |
from buster.formatters.documents import DocumentsFormatter
|
| 7 |
from buster.formatters.prompts import PromptFormatter
|
| 8 |
+
from buster.retriever import DeepLakeRetriever, Retriever
|
| 9 |
from buster.tokenizers import GPTTokenizer
|
| 10 |
from buster.validators import QuestionAnswerValidator, Validator
|
| 11 |
+
from huggingface_hub import hf_hub_download
|
| 12 |
|
| 13 |
+
from utils import extract_zip
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 18 |
|
| 19 |
HUB_TOKEN = os.getenv("HUB_TOKEN")
|
| 20 |
REPO_ID = "jerpint/towardsai-buster-data"
|
| 21 |
+
HUB_DB_FILE = "deeplake_store.zip"
|
| 22 |
logger.info(f"Downloading {HUB_DB_FILE} from hub...")
|
| 23 |
hf_hub_download(
|
| 24 |
repo_id=REPO_ID,
|
|
|
|
| 28 |
local_dir=".",
|
| 29 |
)
|
| 30 |
|
| 31 |
+
extract_zip(zip_file_path="deeplake_store.zip", output_path="deeplake_store")
|
| 32 |
+
|
| 33 |
|
| 34 |
buster_cfg = BusterConfig(
|
| 35 |
validator_cfg={
|
|
|
|
| 63 |
},
|
| 64 |
},
|
| 65 |
retriever_cfg={
|
| 66 |
+
"path": "./deeplake_store",
|
| 67 |
"top_k": 3,
|
| 68 |
"thresh": 0.7,
|
| 69 |
"max_tokens": 2000,
|
|
|
|
| 117 |
|
| 118 |
# initialize buster with the config in cfg.py (adapt to your needs) ...
|
| 119 |
# buster_cfg = cfg.buster_cfg
|
| 120 |
+
retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
|
| 121 |
tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
|
| 122 |
document_answerer: DocumentAnswerer = DocumentAnswerer(
|
| 123 |
completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
|
embed_documents.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import openai
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from deeplake.core.vectorstore import VectorStore
|
| 4 |
+
|
| 5 |
+
from utils import zip_contents
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed one string or a batch of strings with the OpenAI embeddings API.

    Args:
        texts: A single string or a list of strings to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A list of embedding vectors (one per input text), aligned with the
        input order.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Newlines can degrade embedding quality; flatten them to spaces.
    texts = [t.replace("\n", " ") for t in texts]

    response = openai.Embedding.create(input=texts, model=model)
    # Sort by the per-item "index" field so that embeddings line up with the
    # input texts even if the API returns results out of order.
    return [
        item["embedding"]
        for item in sorted(response["data"], key=lambda item: item["index"])
    ]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def extract_metadata(df: pd.DataFrame) -> list[dict]:
    """Extract per-row metadata from the dataframe in deeplake dict format.

    Note: the original annotation said ``dict``, but the function returns a
    list with one dict per dataframe row.

    Args:
        df: DataFrame with at least the columns 'url', 'source' and 'title'.

    Returns:
        A list of ``{"url", "source", "title"}`` dicts, one per row, in row
        order.
    """
    metadata = df.apply(
        lambda x: {
            "url": x.url,
            "source": x.source,
            "title": x.title,
        },
        axis=1,
    ).to_list()
    return metadata
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
if __name__ == "__main__":
    # One-off job: embed preprocessed document chunks into a deeplake vector
    # store, then zip the store for upload to the hub.
    vector_store_path = "deeplake_store"
    chunk_file = "data/chunks_preprocessed.csv"
    overwrite = True

    df = pd.read_csv(chunk_file)

    # Fail fast if the expected columns are missing from the CSV.
    for col in ["url", "source", "title", "content"]:
        assert col in df.columns, f"missing column: {col}"

    # extract the text + metadata
    metadata = extract_metadata(df)
    chunked_text = df.content.to_list()

    # init the vector store
    vector_store = VectorStore(
        path=vector_store_path,
        # Honour the flag above instead of a hard-coded True (the original
        # defined `overwrite` but never used it).
        overwrite=overwrite,
    )

    # add the embeddings (deeplake calls `embedding_function` on
    # `embedding_data` and stores the vectors alongside text + metadata)
    vector_store.add(
        text=chunked_text,
        embedding_function=embedding_function,
        embedding_data=chunked_text,
        metadata=metadata,
    )

    # save the deeplake folder to a zip file
    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
    print(f"Contents zipped to: {zipped_file_path}")
|
gradio_app.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
import cfg
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
-
from cfg import buster
|
| 7 |
-
|
| 8 |
|
| 9 |
-
import
|
|
|
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -86,7 +85,7 @@ with block:
|
|
| 86 |
placeholder="Ask a question to AI stackoverflow here...",
|
| 87 |
lines=1,
|
| 88 |
)
|
| 89 |
-
submit = gr.Button(value="Send", variant="secondary")
|
| 90 |
|
| 91 |
examples = gr.Examples(
|
| 92 |
examples=[
|
|
|
|
| 1 |
+
import logging
|
| 2 |
import os
|
| 3 |
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
import cfg
|
| 8 |
+
from cfg import buster
|
| 9 |
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 85 |
placeholder="Ask a question to AI stackoverflow here...",
|
| 86 |
lines=1,
|
| 87 |
)
|
| 88 |
+
submit = gr.Button(value="Send", variant="secondary")
|
| 89 |
|
| 90 |
examples = gr.Examples(
|
| 91 |
examples=[
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
git+https://github.com/jerpint/[email protected]
|
| 2 |
gradio
|
|
|
|
|
|
| 1 |
git+https://github.com/jerpint/[email protected]
|
| 2 |
gradio
|
| 3 |
+
deeplake
|
utils.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def zip_contents(input_path, output_path):
    """
    Create a zip archive of everything under ``input_path``.

    Paths inside the archive are stored relative to ``input_path``, and the
    archive is named ``<basename(input_path)>.zip`` inside ``output_path``.

    Args:
        input_path (str): Directory whose contents should be archived.
        output_path (str): Directory in which to create the zip file.

    Returns:
        str: Path of the created zip file.

    Raises:
        ValueError: If ``input_path`` does not exist.
    """
    if not os.path.exists(input_path):
        raise ValueError("The specified input path does not exist.")

    archive_name = os.path.basename(input_path) + ".zip"
    archive_path = os.path.join(output_path, archive_name)

    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _, filenames in os.walk(input_path):
            for filename in filenames:
                absolute = os.path.join(dirpath, filename)
                # Store entries relative to the input directory root.
                archive.write(absolute, arcname=os.path.relpath(absolute, input_path))

    return archive_path
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def extract_zip(zip_file_path, output_path):
    """
    Unpack a zip archive into a destination directory.

    Args:
        zip_file_path (str): Path of the zip archive to unpack.
        output_path (str): Directory into which the contents are extracted.

    Returns:
        str: The destination directory path.

    Raises:
        ValueError: If ``zip_file_path`` does not exist.
    """
    if not os.path.exists(zip_file_path):
        raise ValueError("The specified zip file does not exist.")

    with zipfile.ZipFile(zip_file_path, "r") as archive:
        archive.extractall(output_path)

    return output_path
|