Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitignore +2 -0
- README.md +5 -9
- __pycache__/rag.cpython-311.pyc +0 -0
- front_end.py +24 -0
- rag.py +80 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.venv
|
2 |
+
.env
|
README.md
CHANGED
@@ -1,12 +1,8 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: red
|
5 |
-
colorTo: gray
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
|
|
|
1 |
---
|
2 |
+
title: voc_bot
|
3 |
+
app_file: front_end.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
+
sdk_version: 4.42.0
|
|
|
|
|
6 |
---
|
7 |
+
# voc_bot
|
8 |
+
RAG bot on data scraped by data scraping crews
|
__pycache__/rag.cpython-311.pyc
ADDED
Binary file (4.82 kB). View file
|
|
front_end.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
from gradio.themes.base import Base

from rag import mongo_rag_tool

# Gradio front end for the VOC RAG bot: a single question box, a submit
# button, and two read-only boxes showing the answer and its sources.
with gr.Blocks(theme=Base(), title="Market Research and VOC bot") as demo:
    gr.Markdown(
        """
        # VOC App using mined data
        """)

    question_box = gr.Textbox(label="Enter your Question:")

    with gr.Row():
        submit_btn = gr.Button("Submit", variant="primary")

    with gr.Column():
        answer_box = gr.Textbox(lines=1, max_lines=10, label="Answer:")
        sources_box = gr.Textbox(lines=1, max_lines=10, label="Sources:")

    # mongo_rag_tool returns (answer, sources); wire each element of the
    # tuple to its own output textbox.
    submit_btn.click(mongo_rag_tool, question_box, outputs=[answer_box, sources_box])

demo.launch(share=True)
|
rag.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from langchain_openai import OpenAIEmbeddings
|
3 |
+
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
|
4 |
+
from langchain_core.prompts import PromptTemplate
|
5 |
+
from langchain.chains import RetrievalQA
|
6 |
+
from langchain_openai import ChatOpenAI
|
7 |
+
import logging
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
|
10 |
+
load_dotenv()
|
11 |
+
|
12 |
+
|
13 |
+
# Name of the Atlas vector search index used by MongoDBAtlasVectorSearch.
INDEX_NAME = "vector_index"
# MongoDB database holding the scraped-data collections.
DATABASE_NAME = "scraped_data_db"
|
15 |
+
|
16 |
+
def mongo_rag_tool(query: str) -> tuple[str, str]:
    """
    Retrieve documents from a MongoDB Atlas vector store and answer the query with RAG.

    The documents most semantically close to the query are retrieved with an MMR
    search, passed as context to a GPT-4o RetrievalQA chain, and the model's
    answer is returned together with a formatted listing of the source documents.

    args:
        query: str: The question to answer using the retrieved documents.
    returns:
        tuple[str, str]: (answer, formatted sources). On failure both elements
        contain the error message so both Gradio output boxes stay populated.
    """
    try:
        # Connection details and credentials come from the environment
        # (.env loaded at module import via load_dotenv()).
        collection_name = os.getenv("MONGODB_COLLECTION_NAME")
        openai_api_key = os.getenv("OPENAI_API_KEY")
        embeddings = OpenAIEmbeddings(
            openai_api_key=openai_api_key,
            disallowed_special=(),
            model="text-embedding-3-small",
        )

        uri = os.getenv("MONGO_CONNECTION_STRING")
        logging.info("Creating the mongo vector search object")
        vector_search = MongoDBAtlasVectorSearch.from_connection_string(
            uri,
            DATABASE_NAME + "." + collection_name,
            embeddings,
            index_name=INDEX_NAME,
        )

        logging.info("Retrieving the documents and answering the query")
        # Project only the fields the formatter below actually reads.
        post_filter = [{"$project": {"_id": 0, "text": 1, "source": 1, "score": 1, "embedding": 1}}]
        qa_retriever = vector_search.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 10, "fetch_k": 100, "post_filter_pipeline": post_filter},
        )

        prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
If you know the answer give a comprehensive, detailed and insightful answer.
{context}
Question: {question}
"""
        PROMPT = PromptTemplate(
            template=prompt_template, input_variables=["context", "question"]
        )
        qa = RetrievalQA.from_chain_type(
            llm=ChatOpenAI(api_key=openai_api_key, model="gpt-4o", temperature=0.2),
            chain_type="stuff",
            retriever=qa_retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": PROMPT},
        )

        docs = qa.invoke({"query": query})

        if docs:
            logging.info("Saving the retrieved documents")
            sources = docs["source_documents"]

            source_list = [
                {"content": result.page_content, "source": result.metadata["source"]}
                for result in sources
            ]
            formatted_sources = "\n".join(
                [f"Content: {source['content']}\nSource: {source['source']}\n" for source in source_list]
            )
            return docs["result"], formatted_sources

        # FIX: previously this path fell through and implicitly returned None,
        # which the two Gradio output boxes cannot unpack; return explicit text.
        return "No answer was generated for this query.", ""
    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        # FIX: the second element was missing its f-prefix, so the UI displayed
        # the literal text "{str(e)}" instead of the actual error message.
        return f"An error occurred: {str(e)}", f"An error occurred: {str(e)}"

#mongo_rag_tool("What do people think about caterpillar vision link fleet management app")
|