import logging
import os

from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import QuestionAnswerValidator, Validator
from huggingface_hub import hf_hub_download

from utils import extract_zip

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# For authentication
USERNAME = os.getenv("BUSTER_USERNAME")
PASSWORD = os.getenv("BUSTER_PASSWORD")

HUB_TOKEN = os.getenv("HUB_TOKEN")
REPO_ID = os.getenv("HF_DATASET")
# HUB_DB_FILE = "deeplake_store.zip"
DEEPLAKE_DATASET = os.getenv("DEEPLAKE_DATASET", "wiki_tai_langchain")
ZIP_FILE = DEEPLAKE_DATASET + ".zip"
| logger.info(f"Downloading {ZIP_FILE} from hub...") | |
| hf_hub_download( | |
| repo_id=REPO_ID, | |
| repo_type="dataset", | |
| filename=ZIP_FILE, | |
| token=HUB_TOKEN, | |
| local_dir=".", | |
| ) | |
| extract_zip(zip_file_path=ZIP_FILE, output_path=DEEPLAKE_DATASET) | |
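# The extracted folder is the local Deep Lake store that DeepLakeRetriever points to below.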

example_questions = [
    "What is the LLama model?",
    "What is a LLM?",
    "What is an embedding?",
]
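# NOTE: example_questions above are presumably surfaced as suggested prompts in the
# Space's UI (an assumption; this snippet does not show how they are consumed).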

buster_cfg = BusterConfig(
    validator_cfg={
        "unknown_response_templates": [
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
        ],
        "unknown_threshold": 0.85,
        "embedding_model": "text-embedding-ada-002",
        "use_reranking": True,
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "check_question_prompt": """You are a chatbot, answering questions about large language models and artificial intelligence.
Users will ask all sorts of questions, and some might be tangentially related.
Users will learn to build LLM-powered apps, with LangChain & Deep Lake among other technologies.
As long as a question is somewhat related to the topic, respond 'true'. If a question is completely unrelated, respond 'false'.
For example:
Q: How can I setup my own chatbot?
true
Q: What is the meaning of life?
false
A user will now submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
    },
    retriever_cfg={
        "path": f"./{DEEPLAKE_DATASET}",
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_model": "text-embedding-ada-002",
    },
    documents_answerer_cfg={
        "no_documents_message": "No blog posts are available for this question.",
    },
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "source", "title"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI). "
            "You are provided information found in the <DOCUMENTS> tag. "
            "Only respond with information inside the <DOCUMENTS> tag. DO NOT use additional information, even if you know the answer. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
            "Here is the information you can use: "
        ),
| "text_after_docs": ( | |
| "REMEMBER:\n" | |
| "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)." | |
| "You are provided information found in the <DOCUMENTS> tag. " | |
| "Here are the rules you must follow:\n" | |
| "* Only respond with infomration inside the <DOCUMENTS> tag. DO NOT providew additional information, even if you know the answer. " | |
| "* If the answer is in the documentation, summarize it in a helpful way to the user. " | |
| "* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. " | |
| "* Only summarize the information in the <DOCUMENTS> tag, do not respond otherwise. " | |
| "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. " | |
| "* Do not reference any links, urls or hyperlinks in your answers.\n" | |
| "* Make sure to format your answers in Markdown format, including code block and snippets.\n" | |
| "* If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n" | |
| "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'" | |
| "For example:\n" | |
| "What is the meaning of life for a qa bot?\n" | |
| "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?" | |
| "Now answer the following question:\n" | |
| ), | |
| }, | |
| ) | |
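

# setup_buster wires the components configured above into a single Buster instance:
# a Deep Lake retriever, a ChatGPT-backed document answerer, and a question/answer validator.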
def setup_buster(buster_cfg):
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer: DocumentAnswerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterJSON(
            tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
        ),
        prompt_formatter=PromptFormatter(
            tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
        ),
        **buster_cfg.documents_answerer_cfg,
    )
    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(
        retriever=retriever, document_answerer=document_answerer, validator=validator
    )
    return buster
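

# Usage sketch (not part of the original snippet): build the Buster instance from the
# config defined above. A deployed Space would typically route user questions to this
# instance through a chat UI; that wiring is omitted here.
buster = setup_buster(buster_cfg)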