Spaces:
Runtime error
Runtime error
Upload 2 files
Browse files- app.py +349 -0
- requirements.txt +0 -0
app.py
ADDED
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import dotenv
|
2 |
+
# Load environment variables from .env file
|
3 |
+
dotenv.load_dotenv()
|
4 |
+
|
5 |
+
import streamlit as st
|
6 |
+
import os
|
7 |
+
import sys
|
8 |
+
import pickle
|
9 |
+
import numpy as np
|
10 |
+
import spacy # Added to explicitly check for spacy model loading
|
11 |
+
|
12 |
+
# --- Page configuration ---
# st.set_page_config has to be the first Streamlit command executed on the page.
# It therefore runs before the CSS injection and before any st.error that the
# start-up checks below may emit.
st.set_page_config(layout="wide", page_title="Drugbot!", page_icon="💊")

# --- Custom CSS for reduced whitespace and colors ---
st.markdown(
    """
<style>
/* Reduce top padding for the main Streamlit app container */
.stApp {
padding-top: 0px; /* Reduced this value to minimize whitespace at the very top */
padding-bottom: 20px;
}

/* Set a subtle background color for the entire page */
body {
background-color: #f0f8ff; /* AliceBlue - a very light blue */
color: #333333; /* Dark gray for text */
}

/* Style for headers */
h1, h2, h3, h4, h5, h6 {
color: #1a5276; /* Darker blue for headings */
}

/* Style for buttons */
.stButton>button {
background-color: #28a745; /* Green for primary button */
color: white;
border-radius: 8px;
padding: 10px 20px;
border: none;
box-shadow: 2px 2px 5px rgba(0,0,0,0.2);
transition: background-color 0.3s ease;
}
.stButton>button:hover {
background-color: #218838; /* Darker green on hover */
}

/* Style for text areas and select boxes */
.stTextArea textarea, .stSelectbox [data-testid="stSelectbox"] {
border-radius: 8px;
border: 1px solid #cccccc;
}

/* Style for info, success, warning, error boxes */
.stAlert {
border-radius: 8px;
}

</style>
""",
    unsafe_allow_html=True
)

# --- Global message log ---
# Collected (type, message) tuples; rendered in the log expander at the bottom of the page.
app_messages = []


def log_message(type, message):
    """
    Record a message in the application log.

    Every message is appended to ``app_messages`` as a ``(type, message)`` tuple.
    Only messages of type "error" are additionally rendered right away with
    ``st.error``; all other types are stored without being displayed here.

    Args:
        type: Message category; the rest of this file uses "info", "success",
            "warning" and "error". (The name shadows the builtin but is kept
            so existing callers are unaffected.)
        message: Text to record.
    """
    app_messages.append((type, message))
    if type == "error":
        st.error(message)


# Add the 'Scripts' directory to the Python path so that Query_processing,
# Retrieval and Answer_Generation can be imported as top-level modules.
script_dir = os.path.join(os.path.dirname(__file__), 'Scripts')
log_message("info", f"Attempting to add '{script_dir}' to Python path.")
if script_dir not in sys.path:
    sys.path.append(script_dir)
    log_message("info", f"'{script_dir}' added to sys.path.")
else:
    log_message("info", f"'{script_dir}' already in sys.path.")

# --- Debugging: Check if script files exist ---
# Maps each required script name to whether it was found inside script_dir.
script_files_to_check = {
    script_name: os.path.exists(os.path.join(script_dir, script_name))
    for script_name in ("Query_processing.py", "Retrieval.py", "Answer_Generation.py")
}
for script_name, found in script_files_to_check.items():
    if not found:
        script_path = os.path.join(script_dir, script_name)
        log_message("error", f"Error: Script file not found at expected path: {script_path}")
all_scripts_found = all(script_files_to_check.values())

if not all_scripts_found:
    log_message("error", "One or more essential script files are missing from the 'Scripts' directory. "
                         "Please ensure your project structure is correct.")
    st.stop()  # Stop execution if critical files are missing

# Import the core logic modules (only resolvable after the sys.path change above)
try:
    from Query_processing import preprocess_query
    from Retrieval import Retrieval_averagedQP
    from Answer_Generation import answer_generation
    log_message("success", "Core modules imported successfully!")
except ImportError as e:
    log_message("error", f"Error importing core modules. Make sure 'Scripts' directory is correctly structured and contains "
                         f"Query_processing.py, Retrieval.py, and Answer_Generation.py. Error: {e}")
    st.stop()

# --- Configuration ---
|
122 |
+
|
123 |
+
# Locations of the dataset and the vector-store artifacts.
# Everything is resolved relative to the directory that contains app.py.
_APP_ROOT = os.path.dirname(__file__)
VECTORS_DIR = os.path.join(_APP_ROOT, 'Vectors')
DATASET_PATH = os.path.join(_APP_ROOT, 'Datasets', 'flattened_drug_dataset_cleaned.csv')
FAISS_INDEX_PATH, DOC_METADATA_PATH, DOC_VECTORS_PATH = (
    os.path.join(VECTORS_DIR, file_name)
    for file_name in ('faiss_index.idx', 'doc_metadata.pkl', 'doc_vectors.npy')
)
|
130 |
+
|
131 |
+
# --- Cached Resources ---
|
132 |
+
# Use st.cache_resource to load heavy models and data only once
|
133 |
+
@st.cache_resource
def load_all_assets():
    """
    Verify that the retrieval artifacts exist on disk.

    Checks, in order, the FAISS index, the document metadata and the document
    vectors files. No model or data is actually loaded here: the SciSpaCy model
    is expected to be loaded by Query_processing, so this function only logs
    that expectation.

    The function is decorated with ``st.cache_resource``, so Streamlit caches
    its result instead of re-running the checks on every script rerun.
    NOTE(review): a ``False`` result appears to be cached as well - confirm
    that a restart is the intended way to re-check after fixing missing files.

    Returns:
        bool: True when all three files exist; False when one is missing or an
        unexpected exception occurs (an error is logged in both cases).
    """
    required_files = (
        ("FAISS index", FAISS_INDEX_PATH),
        ("document metadata", DOC_METADATA_PATH),
        ("document vectors", DOC_VECTORS_PATH),
    )
    with st.spinner("Verifying medical knowledge base and models... This might take a moment."):
        try:
            # 1. Check for presence of FAISS and embedding files; stop at the first gap.
            for label, path in required_files:
                if not os.path.exists(path):
                    log_message("error", f"Missing {label} file: {path}")
                    return False

            # 2. The SciSpaCy model is deliberately not loaded here, because
            # Query_processing is expected to load it once itself. If an explicit
            # spacy.load("en_core_sci_md") check is ever added, wrap that call in
            # its own `except OSError` to report a missing model.
            log_message("info", "SciSpaCy 'en_core_sci_md' model is expected to be loaded by Query_processing.")

            log_message("success", "Medical knowledge base files verified. Models will be loaded as needed.")
            return True  # Indicate successful verification
        except Exception as e:
            log_message("error", f"Failed to verify assets. Please ensure all data and vector files are in their correct paths. Error: {e}")
            return False
|
175 |
+
|
176 |
+
# Run the (cached) asset verification before the page body is drawn
assets_loaded = load_all_assets()

# --- Title and Header ---
st.title("💊 DrugBot")
st.markdown("---")

# --- Instructions ---
usage_text = """
Welcome to DrugBot - Retrieval based Medical Drug QA Chatbot! You can ask questions about medical drugs, and I will retrieve
information from a verified database to provide accurate answers.

1. **Select an example query** from the dropdown or **type your own question** in the text area below.
2. Click the **"Get Answer"** button.
3. Wait for the chatbot to process your query and generate an answer.
"""
st.header("How to Use:")
st.write(usage_text)
st.markdown("---")

# --- Example Queries ---
st.header("Try These Examples:")
# The first entry is a sentinel meaning "nothing chosen yet"
example_prompt = "Select an example query..."
example_queries = [
    example_prompt,
    "What is the dosage for Azithromycin?",
    "What are the side effects of Ibuprofen?",
    "How should I take Amoxicillin?",
    "What are the precautions for Warfarin?",
    "What are the drug interactions for Metformin?",
    "What is Paracetamol used for?",
    "Can pregnant women take Aspirin?",
    "How does Prednisone work?",
    "What is the recommended dose for children for Tylenol?",
]

selected_example = st.selectbox("Choose a pre-defined question:", example_queries)

# Pre-fill the free-text box with the chosen example, or leave it empty for the sentinel
prefill = selected_example if selected_example != example_prompt else ""
user_query = st.text_area(
    "Or type your question here:",
    value=prefill,
    height=100,
    placeholder="e.g., What is the dosage for Azithromycin?",
)
|
224 |
+
|
225 |
+
# --- Chatbot Interaction ---
# Pipeline run on click: preprocess the query -> retrieve chunks -> generate an answer.
# log_message only renders "error" entries immediately, so warnings meant for the
# user are also shown with st.warning here; otherwise they would be visible only
# inside the log expander at the bottom of the page.
if st.button("Get Answer", type="primary"):
    if not assets_loaded:
        log_message("error", "Application assets failed to verify. Please check the console for errors.")
    elif not user_query.strip():
        empty_query_msg = "Please enter a question or select an example query."
        log_message("warning", empty_query_msg)
        st.warning(empty_query_msg)
    elif "GROQ_API_KEY" not in os.environ:
        # answer generation goes through the Groq API, which needs this key
        log_message("error", "GROQ_API_KEY environment variable not set. Please set it to use the chatbot.")
    else:
        with st.spinner("Thinking... Retrieving and generating answer..."):
            try:
                # 1. Preprocess Query
                # Query_processing.py should handle its own spacy model loading.
                (intent, sub_intent), entities = preprocess_query(user_query)
                log_message("info", f"Detected Intent: {intent}, Sub-Intent: {sub_intent}, Entities: {entities}")

                # 2. Retrieve Chunks
                # Retrieval_averagedQP is expected to load FAISS index and vectors internally.
                # The result is used as a pandas DataFrame below (.empty / .head / .to_dict).
                chunks = Retrieval_averagedQP(user_query, intent, entities)

                if chunks.empty:
                    no_match_msg = "No relevant information found for your query. Please try rephrasing."
                    log_message("warning", no_match_msg)
                    st.warning(no_match_msg)
                else:
                    # 3. Generate Answer
                    answer = answer_generation(user_query, chunks)

                    log_message("info", f"Generated Answer Content: {answer[:200]}...")  # Log first 200 chars
                    if not answer.strip():
                        log_message("warning", "Answer generation returned an empty response.")
                        st.warning("Could not generate a clear answer for this query. Please try rephrasing.")
                    else:
                        log_message("success", "Answer generated successfully!")
                        st.success("Answer:")
                        st.write(answer)

                    with st.expander("See Retrieved Chunks (for debugging/transparency)"):
                        st.write("Top 3 Retrieved Chunks:")
                        for i, chunk in enumerate(chunks.head(3).to_dict(orient='records')):
                            st.write(f"**Chunk {i+1}:**")
                            st.json(chunk)
                            st.markdown("---")

            except Exception as e:
                # Top-level boundary: report any pipeline failure instead of crashing the page
                log_message("error", f"An error occurred while processing your request: {e}")
                st.info("Please try again or rephrase your question.")
|
273 |
+
|
274 |
+
st.markdown("---")
|
275 |
+
|
276 |
+
# --- About Section ---
# Project description shown inside a collapsible expander
about_markdown = """
This project implements a **Retrieval-Based Question Answering (QA) system** designed to answer user queries
about medical drugs. It aims to provide accurate and factually grounded information by retrieving relevant
details from a verified database.

### Purpose
With the rapid increase in approved medications, ensuring factual accuracy in medical information is critical.
Traditional Large Language Models (LLMs) can sometimes "hallucinate" or provide untraceable answers.
Our system addresses this by grounding its responses in a curated database, ensuring factual consistency
and increasing user trust.

### Methodology
The system follows a multi-stage pipeline:
1. **Data Acquisition & Preprocessing:** Information about 2,755 drugs was web-scraped from MayoClinic.com,
cleaned, and flattened into a structured CSV dataset.
2. **Embedding Generation:** The dataset content is embedded using the **MiniLM-V6** model, and indexed
with **FAISS** (Facebook AI Similarity Search) for efficient similarity-based retrieval.
3. **Query Processing:** User queries undergo **intent and sub-intent classification** (e.g., identifying if
the user is asking about "side effects" or "dosage") and **Named Entity Recognition (NER)** using SciSpaCy
to improve retrieval precision.
4. **Retrieval Pipeline:**
* **Query Vectorization:** The user query is vectorized using MiniLM-V6, incorporating weighted intent vectors.
* **Initial Retrieval:** FAISS is used to retrieve the top 10 most similar document chunks.
* **Reranking:** The retrieved chunks are then reranked using **Sentence-BioBERT**, which excels at
capturing biomedical contexts, significantly improving the relevance of the final selected documents.
5. **Answer Generation:** The top 3 reranked context chunks, along with the original query, are fed to the
**LLaMA-4 model** (via Groq API). The LLM is prompted to generate an answer *strictly based on the
provided context*, minimizing hallucination.

### Models Used
* **MiniLM-L6-v2:** For FAISS-based vector retrieval.
* **Sentence-BioBERT:** For reranking candidate chunks.
* **LLaMA-4:** For final answer generation (accessed via Groq API).
* **SciSpaCy:** For Named Entity Recognition and intent classification.

This project was developed by Niranjan Sathish and Hariharan Chandrasekar.
"""

st.header("About This Project")
with st.expander("Learn More About the Medical Drug QA Chatbot"):
    st.markdown(about_markdown)
|
318 |
+
|
319 |
+
# --- Repository Link Button (Placeholder) ---
resources_note = """
Once the project is hosted, you'll find links to the repository or Hugging Face Space here.
"""
st.markdown("---")
st.write("### Project Resources")
st.markdown(resources_note)
# Placeholder for the real link buttons; uncomment and fill in the URLs once they exist.
# if st.button("Go to GitHub Repository"):
#     st.markdown("[GitHub Repository Link](YOUR_GITHUB_REPO_URL_HERE)")
# if st.button("Go to Hugging Face Space"):
#     st.markdown("[Hugging Face Space Link](YOUR_HUGGING_FACE_SPACE_URL_HERE)")
|
332 |
+
|
333 |
+
|
334 |
+
# --- Application Logs Section ---
# Replays everything collected in app_messages during this script run.
st.markdown("---")
st.header("Application Logs")

# Message type -> Streamlit call used to render it; unknown types are skipped
log_renderers = {
    "info": st.info,
    "success": st.success,
    "warning": st.warning,
    "error": st.error,
}

with st.expander("Show/Hide Logs"):
    if not app_messages:
        st.write("No application messages yet.")
    for msg_type, msg_content in app_messages:
        render = log_renderers.get(msg_type)
        if render is not None:
            render(msg_content)
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|