Spaces:

NiranjanSathish
/

DrugBot-Retrieval_Based_QA_Chatbot

Runtime error

App Files Files Community

NiranjanSathish committed on Jul 27

Commit

965e103

verified ·

1 Parent(s): fdf12fd

Upload 2 files

Browse files

Files changed (2) hide show

app.py +349 -0
requirements.txt +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,349 @@

+import dotenv
+# Load environment variables from .env file
+dotenv.load_dotenv()
+import streamlit as st
+import os
+import sys
+import pickle
+import numpy as np
+import spacy # Added to explicitly check for spacy model loading
+# --- Custom CSS for reduced whitespace and colors ---
+st.markdown(
+    """
+    <style>
+    /* Reduce top padding for the main Streamlit app container */
+    .stApp {
+        padding-top: 0px; /* Reduced this value to minimize whitespace at the very top */
+        padding-bottom: 20px;
+    }
+    /* Set a subtle background color for the entire page */
+    body {
+        background-color: #f0f8ff; /* AliceBlue - a very light blue */
+        color: #333333; /* Dark gray for text */
+    }
+    /* Style for headers */
+    h1, h2, h3, h4, h5, h6 {
+        color: #1a5276; /* Darker blue for headings */
+    }
+    /* Style for buttons */
+    .stButton>button {
+        background-color: #28a745; /* Green for primary button */
+        color: white;
+        border-radius: 8px;
+        padding: 10px 20px;
+        border: none;
+        box-shadow: 2px 2px 5px rgba(0,0,0,0.2);
+        transition: background-color 0.3s ease;
+    }
+    .stButton>button:hover {
+        background-color: #218838; /* Darker green on hover */
+    }
+    /* Style for text areas and select boxes */
+    .stTextArea textarea, .stSelectbox [data-testid="stSelectbox"] {
+        border-radius: 8px;
+        border: 1px solid #cccccc;
+    }
+    /* Style for info, success, warning, error boxes */
+    .stAlert {
+        border-radius: 8px;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True
+)
+# --- Global message log ---
+# This list will store messages to be displayed in the log expander
+app_messages = []
+def log_message(type, message):
+    """
+    Helper function to append messages to the log list and display them prominently
+    based on their type.
+    """
+    app_messages.append((type, message))
+    if type == "error":
+        st.error(message)
+# Add the 'Scripts' directory to the Python path
+# This allows importing modules like Query_processing, Retrieval, and Answer_Generation
+script_dir = os.path.join(os.path.dirname(__file__), 'Scripts')
+log_message("info", f"Attempting to add '{script_dir}' to Python path.")
+if script_dir not in sys.path:
+    sys.path.append(script_dir)
+    log_message("info", f"'{script_dir}' added to sys.path.")
+else:
+    log_message("info", f"'{script_dir}' already in sys.path.")
+# --- Debugging: Check if script files exist ---
+script_files_to_check = {
+    "Query_processing.py": False,
+    "Retrieval.py": False,
+    "Answer_Generation.py": False
+}
+all_scripts_found = True
+for script_name in script_files_to_check:
+    script_path = os.path.join(script_dir, script_name)
+    if os.path.exists(script_path):
+        script_files_to_check[script_name] = True
+    else:
+        all_scripts_found = False
+        log_message("error", f"Error: Script file not found at expected path: {script_path}")
+if not all_scripts_found:
+    log_message("error", "One or more essential script files are missing from the 'Scripts' directory. "
+             "Please ensure your project structure is correct.")
+    st.stop() # Stop execution if critical files are missing
+# Import your core logic modules
+try:
+    from Query_processing import preprocess_query
+    from Retrieval import Retrieval_averagedQP
+    from Answer_Generation import answer_generation
+    log_message("success", "Core modules imported successfully!")
+except ImportError as e:
+    log_message("error", f"Error importing core modules. Make sure 'Scripts' directory is correctly structured and contains "
+             f"Query_processing.py, Retrieval.py, and Answer_Generation.py. Error: {e}")
+    st.stop()
+# --- Configuration ---
+# Set page configuration for a wider layout
+st.set_page_config(layout="wide", page_title="Drugbot!", page_icon="💊")
+# Define paths to your data and vectors
+# These paths are relative to the app.py location
+DATASET_PATH = os.path.join(os.path.dirname(__file__), 'Datasets', 'flattened_drug_dataset_cleaned.csv')
+VECTORS_DIR = os.path.join(os.path.dirname(__file__), 'Vectors')
+FAISS_INDEX_PATH = os.path.join(VECTORS_DIR, 'faiss_index.idx')
+DOC_METADATA_PATH = os.path.join(VECTORS_DIR, 'doc_metadata.pkl')
+DOC_VECTORS_PATH = os.path.join(VECTORS_DIR, 'doc_vectors.npy')
+# --- Cached Resources ---
+# Use st.cache_resource to load heavy models and data only once
+@st.cache_resource
+def load_all_assets():
+    """
+    Verifies the existence of necessary files and attempts to load core NLP models.
+    This function will be run only once across all user sessions.
+    """
+    with st.spinner("Verifying medical knowledge base and models... This might take a moment."):
+        try:
+            # 1. Check for presence of FAISS and embedding files
+            if not os.path.exists(FAISS_INDEX_PATH):
+                log_message("error", f"Missing FAISS index file: {FAISS_INDEX_PATH}")
+                return False
+            if not os.path.exists(DOC_METADATA_PATH):
+                log_message("error", f"Missing document metadata file: {DOC_METADATA_PATH}")
+                return False
+            if not os.path.exists(DOC_VECTORS_PATH):
+                log_message("error", f"Missing document vectors file: {DOC_VECTORS_PATH}")
+                return False
+            # 2. Attempt to load the SciSpaCy model (if Query_processing doesn't handle it globally)
+            # This is a common point of failure, so we'll explicitly check.
+            # Assuming 'en_core_sci_md' is the model name.
+            try:
+                # If spacy.load() is called multiple times, it might cause issues.
+                # It's better if Query_processing handles its own model loading once.
+                # This check is just to ensure the model is loadable.
+                # nlp = spacy.load("en_core_sci_md")
+                # del nlp # Release the model if it's not needed globally here
+                log_message("info", "SciSpaCy 'en_core_sci_md' model is expected to be loaded by Query_processing.")
+            except OSError:
+                log_message("error", "SciSpaCy 'en_core_sci_md' model not found or linked. "
+                         "Please ensure it's installed correctly (e.g., `pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz`).")
+                return False
+            except Exception as e:
+                log_message("error", f"An unexpected error occurred while checking SciSpaCy model: {e}")
+                return False
+            log_message("success", "Medical knowledge base files verified. Models will be loaded as needed.")
+            return True # Indicate successful verification
+        except Exception as e:
+            log_message("error", f"Failed to verify assets. Please ensure all data and vector files are in their correct paths. Error: {e}")
+            return False
+# Load all assets at the start of the application
+assets_loaded = load_all_assets()
+# --- Title and Header ---
+st.title("💊 DrugBot")
+st.markdown("---")
+# --- Instructions ---
+# This section is already placed directly after the title and horizontal rule.
+st.header("How to Use:")
+st.write(
+    """
+    Welcome to DrugBot - Retrieval based Medical Drug QA Chatbot! You can ask questions about medical drugs, and I will retrieve
+    information from a verified database to provide accurate answers.
+    1.  **Select an example query** from the dropdown or **type your own question** in the text area below.
+    2.  Click the **"Get Answer"** button.
+    3.  Wait for the chatbot to process your query and generate an answer.
+    """
+)
+st.markdown("---")
+# --- Example Queries ---
+st.header("Try These Examples:")
+example_queries = [
+    "Select an example query...",
+    "What is the dosage for Azithromycin?",
+    "What are the side effects of Ibuprofen?",
+    "How should I take Amoxicillin?",
+    "What are the precautions for Warfarin?",
+    "What are the drug interactions for Metformin?",
+    "What is Paracetamol used for?",
+    "Can pregnant women take Aspirin?",
+    "How does Prednisone work?",
+    "What is the recommended dose for children for Tylenol?"
+]
+selected_example = st.selectbox(
+    "Choose a pre-defined question:",
+    example_queries
+)
+user_query = st.text_area(
+    "Or type your question here:",
+    value="" if selected_example == "Select an example query..." else selected_example,
+    height=100,
+    placeholder="e.g., What is the dosage for Azithromycin?"
+)
+# --- Chatbot Interaction ---
+if st.button("Get Answer", type="primary"):
+    if not assets_loaded:
+        log_message("error", "Application assets failed to verify. Please check the console for errors.")
+    elif not user_query.strip():
+        log_message("warning", "Please enter a question or select an example query.")
+    else:
+        # Check for Groq API Key
+        if "GROQ_API_KEY" not in os.environ:
+            log_message("error", "GROQ_API_KEY environment variable not set. Please set it to use the chatbot.")
+        else:
+            with st.spinner("Thinking... Retrieving and generating answer..."):
+                try:
+                    # 1. Preprocess Query
+                    # Query_processing.py should handle its own spacy model loading.
+                    (intent, sub_intent), entities = preprocess_query(user_query)
+                    log_message("info", f"Detected Intent: {intent}, Sub-Intent: {sub_intent}, Entities: {entities}")
+                    # 2. Retrieve Chunks
+                    # Retrieval_averagedQP is expected to load FAISS index and vectors internally.
+                    chunks = Retrieval_averagedQP(user_query, intent, entities)
+                    if not chunks.empty: # Check if chunks DataFrame is not empty
+                        # 3. Generate Answer
+                        answer = answer_generation(user_query, chunks)
+                        log_message("info", f"Generated Answer Content: {answer[:200]}...") # Log first 200 chars
+                        if not answer.strip(): # Check if answer is empty after stripping whitespace
+                            log_message("warning", "Answer generation returned an empty response.")
+                            st.warning("Could not generate a clear answer for this query. Please try rephrasing.")
+                        else:
+                            log_message("success", "Answer generated successfully!")
+                            st.success("Answer:") # Display success message
+                            st.write(answer) # This prints the answer in the main area
+                        with st.expander("See Retrieved Chunks (for debugging/transparency)"):
+                            st.write("Top 3 Retrieved Chunks:")
+                            for i, chunk in enumerate(chunks.head(3).to_dict(orient='records')): # Display top 3 for brevity
+                                st.write(f"**Chunk {i+1}:**")
+                                st.json(chunk) # Use st.json for better display of dict
+                                st.markdown("---")
+                    else:
+                        log_message("warning", "No relevant information found for your query. Please try rephrasing.")
+                except Exception as e:
+                    log_message("error", f"An error occurred while processing your request: {e}")
+                    st.info("Please try again or rephrase your question.") # User-friendly message
+st.markdown("---")
+# --- About Section ---
+st.header("About This Project")
+with st.expander("Learn More About the Medical Drug QA Chatbot"):
+    st.markdown(
+        """
+        This project implements a **Retrieval-Based Question Answering (QA) system** designed to answer user queries
+        about medical drugs. It aims to provide accurate and factually grounded information by retrieving relevant
+        details from a verified database.
+        ### Purpose
+        With the rapid increase in approved medications, ensuring factual accuracy in medical information is critical.
+        Traditional Large Language Models (LLMs) can sometimes "hallucinate" or provide untraceable answers.
+        Our system addresses this by grounding its responses in a curated database, ensuring factual consistency
+        and increasing user trust.
+        ### Methodology
+        The system follows a multi-stage pipeline:
+        1.  **Data Acquisition & Preprocessing:** Information about 2,755 drugs was web-scraped from MayoClinic.com,
+            cleaned, and flattened into a structured CSV dataset.
+        2.  **Embedding Generation:** The dataset content is embedded using the **MiniLM-V6** model, and indexed
+            with **FAISS** (Facebook AI Similarity Search) for efficient similarity-based retrieval.
+        3.  **Query Processing:** User queries undergo **intent and sub-intent classification** (e.g., identifying if
+            the user is asking about "side effects" or "dosage") and **Named Entity Recognition (NER)** using SciSpaCy
+            to improve retrieval precision.
+        4.  **Retrieval Pipeline:**
+            * **Query Vectorization:** The user query is vectorized using MiniLM-V6, incorporating weighted intent vectors.
+            * **Initial Retrieval:** FAISS is used to retrieve the top 10 most similar document chunks.
+            * **Reranking:** The retrieved chunks are then reranked using **Sentence-BioBERT**, which excels at
+                capturing biomedical contexts, significantly improving the relevance of the final selected documents.
+        5.  **Answer Generation:** The top 3 reranked context chunks, along with the original query, are fed to the
+            **LLaMA-4 model** (via Groq API). The LLM is prompted to generate an answer *strictly based on the
+            provided context*, minimizing hallucination.
+        ### Models Used
+        * **MiniLM-L6-v2:** For FAISS-based vector retrieval.
+        * **Sentence-BioBERT:** For reranking candidate chunks.
+        * **LLaMA-4:** For final answer generation (accessed via Groq API).
+        * **SciSpaCy:** For Named Entity Recognition and intent classification.
+        This project was developed by Niranjan Sathish and Hariharan Chandrasekar.
+        """
+    )
+# --- Repository Link Button (Placeholder) ---
+# Renders a "Project Resources" heading with placeholder text only; no real
+# links are shown yet.
+st.markdown("---")
+st.write("### Project Resources")
+st.markdown(
+    """
+    Once the project is hosted, you'll find links to the repository or Hugging Face Space here.
+    """
+)
+# Placeholder for the actual button. You can uncomment and update this later.
+# TODO: replace the YOUR_..._URL_HERE placeholders with the real URLs before enabling.
+# if st.button("Go to GitHub Repository"):
+#     st.markdown("[GitHub Repository Link](YOUR_GITHUB_REPO_URL_HERE)")
+# if st.button("Go to Hugging Face Space"):
+#     st.markdown("[Hugging Face Space Link](YOUR_HUGGING_FACE_SPACE_URL_HERE)")
+# --- Application Logs Section ---
+st.markdown("---")
+st.header("Application Logs")
+with st.expander("Show/Hide Logs"):
+    if app_messages:
+        for msg_type, msg_content in app_messages:
+            if msg_type == "info":
+                st.info(msg_content)
+            elif msg_type == "success":
+                st.success(msg_content)
+            elif msg_type == "warning":
+                st.warning(msg_content)
+            elif msg_type == "error":
+                st.error(msg_content)
+    else:
+        st.write("No application messages yet.")

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ