NiranjanSathish committed on
Commit
965e103
·
verified ·
1 Parent(s): fdf12fd

Upload 2 files

Files changed (2)
  1. app.py +349 -0
  2. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,349 @@
+ import dotenv
+ # Load environment variables from .env file
+ dotenv.load_dotenv()
+
+ import streamlit as st
+ import os
+ import sys
+ import pickle
+ import numpy as np
+ import spacy  # Used by load_all_assets() to verify the SciSpaCy model
+
+ # Set page configuration for a wider layout. st.set_page_config must be the
+ # first Streamlit command in the script, so it runs before the CSS injection below.
+ st.set_page_config(layout="wide", page_title="Drugbot!", page_icon="💊")
+
+ # --- Custom CSS for reduced whitespace and colors ---
+ st.markdown(
+     """
+     <style>
+     /* Reduce top padding for the main Streamlit app container */
+     .stApp {
+         padding-top: 0px; /* Minimize whitespace at the very top */
+         padding-bottom: 20px;
+     }
+
+     /* Set a subtle background color for the entire page */
+     body {
+         background-color: #f0f8ff; /* AliceBlue - a very light blue */
+         color: #333333; /* Dark gray for text */
+     }
+
+     /* Style for headers */
+     h1, h2, h3, h4, h5, h6 {
+         color: #1a5276; /* Darker blue for headings */
+     }
+
+     /* Style for buttons */
+     .stButton>button {
+         background-color: #28a745; /* Green for primary button */
+         color: white;
+         border-radius: 8px;
+         padding: 10px 20px;
+         border: none;
+         box-shadow: 2px 2px 5px rgba(0,0,0,0.2);
+         transition: background-color 0.3s ease;
+     }
+     .stButton>button:hover {
+         background-color: #218838; /* Darker green on hover */
+     }
+
+     /* Style for text areas and select boxes */
+     .stTextArea textarea, .stSelectbox [data-testid="stSelectbox"] {
+         border-radius: 8px;
+         border: 1px solid #cccccc;
+     }
+
+     /* Style for info, success, warning, error boxes */
+     .stAlert {
+         border-radius: 8px;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ # --- Global message log ---
+ # This list stores messages to be displayed in the log expander.
+ app_messages = []
+
+ def log_message(msg_type, message):
+     """
+     Append a message to the log list; errors are also displayed
+     prominently in the main area.
+     """
+     app_messages.append((msg_type, message))
+     if msg_type == "error":
+         st.error(message)
+
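Note that a module-level list like `app_messages` is rebuilt on every Streamlit rerun, so entries logged during one interaction vanish on the next. A minimal sketch of a rerun-persistent variant using `st.session_state` (illustrative only, not part of the committed file):

```python
import streamlit as st

# Hypothetical variant: keep logs alive across reruns via session state.
if "app_messages" not in st.session_state:
    st.session_state.app_messages = []

def log_message(msg_type: str, message: str) -> None:
    """Record a (type, text) tuple; errors are also surfaced immediately."""
    st.session_state.app_messages.append((msg_type, message))
    if msg_type == "error":
        st.error(message)
```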
+ # Add the 'Scripts' directory to the Python path.
+ # This allows importing modules like Query_processing, Retrieval, and Answer_Generation.
+ script_dir = os.path.join(os.path.dirname(__file__), 'Scripts')
+ log_message("info", f"Attempting to add '{script_dir}' to Python path.")
+ if script_dir not in sys.path:
+     sys.path.append(script_dir)
+     log_message("info", f"'{script_dir}' added to sys.path.")
+ else:
+     log_message("info", f"'{script_dir}' already in sys.path.")
+
+ # --- Debugging: Check if script files exist ---
+ script_files_to_check = {
+     "Query_processing.py": False,
+     "Retrieval.py": False,
+     "Answer_Generation.py": False
+ }
+ all_scripts_found = True
+
+ for script_name in script_files_to_check:
+     script_path = os.path.join(script_dir, script_name)
+     if os.path.exists(script_path):
+         script_files_to_check[script_name] = True
+     else:
+         all_scripts_found = False
+         log_message("error", f"Script file not found at expected path: {script_path}")
+
+ if not all_scripts_found:
+     log_message("error", "One or more essential script files are missing from the 'Scripts' directory. "
+                 "Please ensure your project structure is correct.")
+     st.stop()  # Stop execution if critical files are missing
+
+ # Import the core logic modules
+ try:
+     from Query_processing import preprocess_query
+     from Retrieval import Retrieval_averagedQP
+     from Answer_Generation import answer_generation
+     log_message("success", "Core modules imported successfully!")
+ except ImportError as e:
+     log_message("error", f"Error importing core modules. Make sure the 'Scripts' directory is correctly structured "
+                 f"and contains Query_processing.py, Retrieval.py, and Answer_Generation.py. Error: {e}")
+     st.stop()
+
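The Scripts modules themselves are not part of this commit. From the call sites later in app.py, their interfaces can be inferred; hypothetical stubs matching those call sites (names real, bodies placeholder):

```python
import pandas as pd

def preprocess_query(query: str):
    """Return ((intent, sub_intent), entities) for a raw user query."""
    return (("dosage", "adult_dose"), ["azithromycin"])  # placeholder values

def Retrieval_averagedQP(query: str, intent: str, entities: list) -> pd.DataFrame:
    """Return a DataFrame of retrieved chunks (empty if nothing matched)."""
    return pd.DataFrame([{"drug": "azithromycin", "section": "dosage", "text": "..."}])

def answer_generation(query: str, chunks: pd.DataFrame) -> str:
    """Return an answer string grounded in the retrieved chunks."""
    return "Answer grounded in the provided context."
```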
+ # --- Configuration ---
+ # (Page configuration is set at the very top of the script, since
+ # st.set_page_config must be Streamlit's first command.)
+
+ # Define paths to the data and vectors, relative to the app.py location
+ DATASET_PATH = os.path.join(os.path.dirname(__file__), 'Datasets', 'flattened_drug_dataset_cleaned.csv')
+ VECTORS_DIR = os.path.join(os.path.dirname(__file__), 'Vectors')
+ FAISS_INDEX_PATH = os.path.join(VECTORS_DIR, 'faiss_index.idx')
+ DOC_METADATA_PATH = os.path.join(VECTORS_DIR, 'doc_metadata.pkl')
+ DOC_VECTORS_PATH = os.path.join(VECTORS_DIR, 'doc_vectors.npy')
+
+ # --- Cached Resources ---
+ # Use st.cache_resource so these heavy checks run only once
+ @st.cache_resource
+ def load_all_assets():
+     """
+     Verifies the existence of necessary files and that the core NLP model
+     can be loaded. Runs only once across all user sessions.
+     """
+     with st.spinner("Verifying medical knowledge base and models... This might take a moment."):
+         try:
+             # 1. Check for presence of FAISS and embedding files
+             if not os.path.exists(FAISS_INDEX_PATH):
+                 log_message("error", f"Missing FAISS index file: {FAISS_INDEX_PATH}")
+                 return False
+             if not os.path.exists(DOC_METADATA_PATH):
+                 log_message("error", f"Missing document metadata file: {DOC_METADATA_PATH}")
+                 return False
+             if not os.path.exists(DOC_VECTORS_PATH):
+                 log_message("error", f"Missing document vectors file: {DOC_VECTORS_PATH}")
+                 return False
+
+             # 2. Verify that the SciSpaCy model is loadable. This is a common
+             # point of failure, so it is checked explicitly here; Query_processing
+             # loads and keeps its own instance, so the copy loaded for this check
+             # is released immediately.
+             try:
+                 nlp = spacy.load("en_core_sci_md")
+                 del nlp  # Release the model; it is not needed globally here
+                 log_message("info", "SciSpaCy 'en_core_sci_md' model verified.")
+             except OSError:
+                 log_message("error", "SciSpaCy 'en_core_sci_md' model not found or linked. "
+                             "Please ensure it is installed correctly (e.g., `pip install "
+                             "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz`).")
+                 return False
+             except Exception as e:
+                 log_message("error", f"An unexpected error occurred while checking the SciSpaCy model: {e}")
+                 return False
+
+             log_message("success", "Medical knowledge base files verified. Models will be loaded as needed.")
+             return True  # Indicate successful verification
+         except Exception as e:
+             log_message("error", f"Failed to verify assets. Please ensure all data and vector files are in their correct paths. Error: {e}")
+             return False
+
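load_all_assets only verifies that the files exist; the retrieval module loads them itself. If one wanted the cached function to return the heavy objects directly, a minimal sketch (assuming the files were written with faiss.write_index, pickle, and numpy.save; the helper name is illustrative):

```python
import pickle
import numpy as np
import faiss  # pip install faiss-cpu
import streamlit as st

@st.cache_resource
def load_retrieval_assets(index_path: str, metadata_path: str, vectors_path: str):
    """Load the FAISS index, chunk metadata, and raw vectors exactly once."""
    index = faiss.read_index(index_path)   # persisted via faiss.write_index
    with open(metadata_path, "rb") as f:
        metadata = pickle.load(f)          # per-chunk metadata records
    vectors = np.load(vectors_path)        # shape: (n_chunks, embedding_dim)
    return index, metadata, vectors
```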
+ # Verify all assets at the start of the application
+ assets_loaded = load_all_assets()
+
+ # --- Title and Header ---
+ st.title("💊 DrugBot")
+ st.markdown("---")
+
+ # --- Instructions ---
+ st.header("How to Use:")
+ st.write(
+     """
+     Welcome to DrugBot, a retrieval-based medical drug QA chatbot! You can ask questions about medical drugs,
+     and I will retrieve information from a verified database to provide accurate answers.
+
+     1. **Select an example query** from the dropdown or **type your own question** in the text area below.
+     2. Click the **"Get Answer"** button.
+     3. Wait for the chatbot to process your query and generate an answer.
+     """
+ )
+ st.markdown("---")
197
+
198
+ # --- Example Queries ---
199
+ st.header("Try These Examples:")
200
+ example_queries = [
201
+ "Select an example query...",
202
+ "What is the dosage for Azithromycin?",
203
+ "What are the side effects of Ibuprofen?",
204
+ "How should I take Amoxicillin?",
205
+ "What are the precautions for Warfarin?",
206
+ "What are the drug interactions for Metformin?",
207
+ "What is Paracetamol used for?",
208
+ "Can pregnant women take Aspirin?",
209
+ "How does Prednisone work?",
210
+ "What is the recommended dose for children for Tylenol?"
211
+ ]
212
+
213
+ selected_example = st.selectbox(
214
+ "Choose a pre-defined question:",
215
+ example_queries
216
+ )
217
+
218
+ user_query = st.text_area(
219
+ "Or type your question here:",
220
+ value="" if selected_example == "Select an example query..." else selected_example,
221
+ height=100,
222
+ placeholder="e.g., What is the dosage for Azithromycin?"
223
+ )
224
+
225
+ # --- Chatbot Interaction ---
226
+ if st.button("Get Answer", type="primary"):
227
+ if not assets_loaded:
228
+ log_message("error", "Application assets failed to verify. Please check the console for errors.")
229
+ elif not user_query.strip():
230
+ log_message("warning", "Please enter a question or select an example query.")
231
+ else:
232
+ # Check for Groq API Key
233
+ if "GROQ_API_KEY" not in os.environ:
234
+ log_message("error", "GROQ_API_KEY environment variable not set. Please set it to use the chatbot.")
235
+ else:
236
+ with st.spinner("Thinking... Retrieving and generating answer..."):
237
+ try:
238
+ # 1. Preprocess Query
239
+ # Query_processing.py should handle its own spacy model loading.
240
+ (intent, sub_intent), entities = preprocess_query(user_query)
241
+ log_message("info", f"Detected Intent: {intent}, Sub-Intent: {sub_intent}, Entities: {entities}")
242
+
243
+ # 2. Retrieve Chunks
244
+ # Retrieval_averagedQP is expected to load FAISS index and vectors internally.
245
+ chunks = Retrieval_averagedQP(user_query, intent, entities)
246
+
247
+ if not chunks.empty: # Check if chunks DataFrame is not empty
248
+ # 3. Generate Answer
249
+ answer = answer_generation(user_query, chunks)
250
+
251
+ log_message("info", f"Generated Answer Content: {answer[:200]}...") # Log first 200 chars
252
+ if not answer.strip(): # Check if answer is empty after stripping whitespace
253
+ log_message("warning", "Answer generation returned an empty response.")
254
+ st.warning("Could not generate a clear answer for this query. Please try rephrasing.")
255
+ else:
256
+ log_message("success", "Answer generated successfully!")
257
+ st.success("Answer:") # Display success message
258
+ st.write(answer) # This prints the answer in the main area
259
+
260
+ with st.expander("See Retrieved Chunks (for debugging/transparency)"):
261
+ st.write("Top 3 Retrieved Chunks:")
262
+ for i, chunk in enumerate(chunks.head(3).to_dict(orient='records')): # Display top 3 for brevity
263
+ st.write(f"**Chunk {i+1}:**")
264
+ st.json(chunk) # Use st.json for better display of dict
265
+ st.markdown("---")
266
+ else:
267
+ log_message("warning", "No relevant information found for your query. Please try rephrasing.")
268
+
269
+
270
+ except Exception as e:
271
+ log_message("error", f"An error occurred while processing your request: {e}")
272
+ st.info("Please try again or rephrase your question.") # User-friendly message
273
+
274
+ st.markdown("---")
275
+
276
+ # --- About Section ---
277
+ st.header("About This Project")
278
+ with st.expander("Learn More About the Medical Drug QA Chatbot"):
279
+ st.markdown(
280
+ """
281
+ This project implements a **Retrieval-Based Question Answering (QA) system** designed to answer user queries
282
+ about medical drugs. It aims to provide accurate and factually grounded information by retrieving relevant
283
+ details from a verified database.
284
+
285
+ ### Purpose
286
+ With the rapid increase in approved medications, ensuring factual accuracy in medical information is critical.
287
+ Traditional Large Language Models (LLMs) can sometimes "hallucinate" or provide untraceable answers.
288
+ Our system addresses this by grounding its responses in a curated database, ensuring factual consistency
289
+ and increasing user trust.
290
+
291
+ ### Methodology
292
+ The system follows a multi-stage pipeline:
293
+ 1. **Data Acquisition & Preprocessing:** Information about 2,755 drugs was web-scraped from MayoClinic.com,
294
+ cleaned, and flattened into a structured CSV dataset.
295
+ 2. **Embedding Generation:** The dataset content is embedded using the **MiniLM-V6** model, and indexed
296
+ with **FAISS** (Facebook AI Similarity Search) for efficient similarity-based retrieval.
297
+ 3. **Query Processing:** User queries undergo **intent and sub-intent classification** (e.g., identifying if
298
+ the user is asking about "side effects" or "dosage") and **Named Entity Recognition (NER)** using SciSpaCy
299
+ to improve retrieval precision.
300
+ 4. **Retrieval Pipeline:**
301
+ * **Query Vectorization:** The user query is vectorized using MiniLM-V6, incorporating weighted intent vectors.
302
+ * **Initial Retrieval:** FAISS is used to retrieve the top 10 most similar document chunks.
303
+ * **Reranking:** The retrieved chunks are then reranked using **Sentence-BioBERT**, which excels at
304
+ capturing biomedical contexts, significantly improving the relevance of the final selected documents.
305
+ 5. **Answer Generation:** The top 3 reranked context chunks, along with the original query, are fed to the
306
+ **LLaMA-4 model** (via Groq API). The LLM is prompted to generate an answer *strictly based on the
307
+ provided context*, minimizing hallucination.
308
+
309
+ ### Models Used
310
+ * **MiniLM-L6-v2:** For FAISS-based vector retrieval.
311
+ * **Sentence-BioBERT:** For reranking candidate chunks.
312
+ * **LLaMA-4:** For final answer generation (accessed via Groq API).
313
+ * **SciSpaCy:** For Named Entity Recognition and intent classification.
314
+
315
+ This project was developed by Niranjan Sathish and Hariharan Chandrasekar.
316
+ """
317
+ )
318
+
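The retrieval-plus-generation pipeline described above can be sketched as follows. This is a minimal illustration, not the committed Retrieval/Answer_Generation code: the Sentence-BioBERT checkpoint name and the Groq model id are assumptions, and the real pipeline additionally mixes weighted intent vectors into the query embedding.

```python
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, util
from groq import Groq  # pip install groq

# Stages 1-2: embed the query and pull the top-10 candidates from FAISS.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def retrieve_top_k(query: str, index: faiss.Index, texts: list, k: int = 10) -> list:
    q = embedder.encode([query], normalize_embeddings=True).astype(np.float32)
    _, ids = index.search(q, k)              # (distances, indices)
    return [texts[i] for i in ids[0]]

# Stage 3: rerank candidates with a biomedical sentence encoder.
# Checkpoint name is an assumption; any Sentence-BioBERT variant would do.
reranker = SentenceTransformer("pritamdeka/S-BioBert-snli-multinli-stsb")

def rerank(query: str, candidates: list, top_n: int = 3) -> list:
    q_emb = reranker.encode(query, convert_to_tensor=True)
    c_emb = reranker.encode(candidates, convert_to_tensor=True)
    scores = util.cos_sim(q_emb, c_emb)[0]
    order = scores.argsort(descending=True)[:top_n]
    return [candidates[i] for i in order]

# Stage 4: generate an answer grounded strictly in the retrieved context.
def generate_answer(query: str, context_chunks: list) -> str:
    client = Groq(api_key=os.environ["GROQ_API_KEY"])
    context = "\n\n".join(context_chunks)
    resp = client.chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",  # assumed model id
        messages=[
            {"role": "system", "content": "Answer strictly from the provided context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        ],
    )
    return resp.choices[0].message.content
```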
+ # --- Repository Link Button (Placeholder) ---
+ st.markdown("---")
+ st.write("### Project Resources")
+ st.markdown(
+     """
+     Once the project is hosted, you'll find links to the repository or Hugging Face Space here.
+     """
+ )
+ # Placeholder for the actual buttons. Uncomment and update these later.
+ # if st.button("Go to GitHub Repository"):
+ #     st.markdown("[GitHub Repository Link](YOUR_GITHUB_REPO_URL_HERE)")
+ # if st.button("Go to Hugging Face Space"):
+ #     st.markdown("[Hugging Face Space Link](YOUR_HUGGING_FACE_SPACE_URL_HERE)")
+
+
+ # --- Application Logs Section ---
+ st.markdown("---")
+ st.header("Application Logs")
+ with st.expander("Show/Hide Logs"):
+     if app_messages:
+         for msg_type, msg_content in app_messages:
+             if msg_type == "info":
+                 st.info(msg_content)
+             elif msg_type == "success":
+                 st.success(msg_content)
+             elif msg_type == "warning":
+                 st.warning(msg_content)
+             elif msg_type == "error":
+                 st.error(msg_content)
+     else:
+         st.write("No application messages yet.")
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ