Spaces:

LeedsLibraries
/

IamEarth

Running on L4

App Files Files Community

LeedsLibraries committed on Apr 4

Commit

cc84f47

verified ·

1 Parent(s): 170162e

Upload Chatbot WS files

Browse files

Files changed (5) hide show

.gitattributes +2 -0
Chatbots.pdf +3 -0
DeepSeekR1.pdf +3 -0
app.py +1114 -0
requirements.txt +16 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Chatbots.pdf filter=lfs diff=lfs merge=lfs -text
+DeepSeekR1.pdf filter=lfs diff=lfs merge=lfs -text

Chatbots.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e40deb492c8fa092846fa4970a48522900b4fb17e47f4f0bbc5b725fe4278f58
+size 1644160

DeepSeekR1.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a73a44c4adc33d64b30df00f55074e4a28d710250002a67b07ca06729f57575
+size 656741

app.py ADDED Viewed

	@@ -0,0 +1,1114 @@

+import streamlit as st
+import os
+import re
+import torch
+import numpy as np
+from sentence_transformers import SentenceTransformer, util
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+# Import for setting environment variables
+import os
+# Import for specific HTTP backend config
+from huggingface_hub import HfFolder
+import hashlib
+# Configure huggingface_hub through environment variables: turn on the
+# hf_transfer download backend, silence experimental-feature warnings, and keep
+# progress bars enabled ("0" means "not disabled"). None of these sets a timeout.
+# NOTE(review): these are assigned after the transformers / sentence_transformers
+# imports above; huggingface_hub may already have read its settings by then --
+# confirm they take effect, or move them above those imports.
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+os.environ["HF_HUB_DISABLE_EXPERIMENTAL_WARNING"] = "1"
+os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "0"
+# Ensure NumPy 2.0 compatibility: re-create the np.float_ alias for code that
+# still refers to it.
+np.float_ = np.float64
+# Per-session state: questions asked, answers given, and the hashes of the
+# normalized questions (used to recognise repeated questions).
+if "question_history" not in st.session_state:
+    st.session_state.question_history = []
+if "answer_history" not in st.session_state:
+    st.session_state.answer_history = []
+if "question_hash_set" not in st.session_state:
+    st.session_state.question_hash_set = set()
+# Streamlit Page Config
+# NOTE(review): Streamlit expects set_page_config to be the first Streamlit
+# command of the script -- confirm the session_state accesses above are allowed.
+st.set_page_config(page_title="📖 Educational PDF Chatbot", layout="wide")
+# Hugging Face token: Streamlit secrets first, HF_API_KEY environment variable
+# as the fallback.
+HF_API_KEY = st.secrets.get("HF_API_KEY", os.getenv("HF_API_KEY"))
+# Store the token through huggingface_hub when one is available.
+if HF_API_KEY:
+    HfFolder.save_token(HF_API_KEY)
+# Model Selection - Updated to use the 8B model
+MODEL_NAME = "Noorhan/mistral-8b-4bit"
+# Abort when no token is configured: show the error in the page, then raise so
+# the rest of the script does not run.
+if not HF_API_KEY:
+    st.error("Hugging Face API key is missing! Please set HF_API_KEY in Streamlit secrets or environment variables.")
+    raise ValueError("Hugging Face API key is missing!")
+@st.cache_resource
+def load_quantized_model():
+    """Loads a quantized version of the model."""
+    try:
+        st.info(f"Loading model {MODEL_NAME}, this may take a few minutes...")
+        # Configure quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+        )
+        # Load tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            MODEL_NAME,
+            token=HF_API_KEY,
+            trust_remote_code=True,
+        )
+        # Load model
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            quantization_config=quantization_config,
+            device_map="auto",
+            torch_dtype=torch.float16,
+            token=HF_API_KEY,
+        )
+        st.success(f"Model {MODEL_NAME} loaded successfully!")
+        return model, tokenizer
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        return None, None
+# Display loading message first
+if "model_loaded" not in st.session_state:
+    st.session_state.model_loaded = False
+    st.info("Initializing model... This may take a few minutes on first load.")
+# Try to load the model
+model, tokenizer = None, None
+if not st.session_state.model_loaded:
+    with st.spinner("Loading model..."):
+        model, tokenizer = load_quantized_model()
+        if model is not None:
+            st.session_state.model_loaded = True
+else:
+    # Use cached model if already loaded
+    model, tokenizer = load_quantized_model()
+# Load Sentence Transformer model for similarity checking
+# Load Sentence Transformer model for similarity checking
+@st.cache_resource
+def load_sentence_model():
+    """Loads sentence transformer model for text similarity with improved error handling."""
+    with st.spinner("Loading similarity model..."):
+        try:
+            # First ensure the model is explicitly downloaded with the HF token
+            from huggingface_hub import hf_hub_download
+            import os
+            model_name = "sentence-transformers/all-MiniLM-L6-v2"
+            # Create cache directory if it doesn't exist
+            cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
+            os.makedirs(cache_dir, exist_ok=True)
+            # Try to use the model
+            st.info(f"Attempting to load sentence transformer model: {model_name}")
+            return SentenceTransformer(model_name, token=HF_API_KEY)
+        except (FileNotFoundError, ConnectionError, OSError) as e:
+            st.warning(f"Error loading the primary model: {str(e)}")
+            st.info("Attempting to use a fallback model...")
+            try:
+                # Try a different model as fallback
+                fallback_model = "all-mpnet-base-v2"
+                return SentenceTransformer(f"sentence-transformers/{fallback_model}", token=HF_API_KEY)
+            except Exception as e2:
+                st.error(f"Failed to load fallback model: {str(e2)}")
+                # Last resort - create a simple embedding model
+                st.warning("Using a simplified embedding approach.")
+                # Define a simple class that mimics the SentenceTransformer interface
+                class SimpleEmbedder:
+                    def encode(self, texts, convert_to_tensor=True):
+                        """Simple word-based encoding"""
+                        import numpy as np
+                        import torch
+                        if isinstance(texts, str):
+                            texts = [texts]
+                        # Create simple embeddings (word count vectors)
+                        embeddings = []
+                        for text in texts:
+                            # Simple word frequency vector (very basic!)
+                            words = set(text.lower().split())
+                            embedding = np.zeros(384)  # Match MiniLM dimension
+                            # Use character positions for a deterministic but simple embedding
+                            for i, word in enumerate(words):
+                                for j, char in enumerate(word):
+                                    if i < 384:
+                                        embedding[i] = ord(char) / 255.0
+                            embeddings.append(embedding)
+                        if convert_to_tensor:
+                            return torch.tensor(embeddings)
+                        return np.array(embeddings)
+                return SimpleEmbedder()
+# Module-level similarity model; the helper functions in this file read this global.
+sentence_model = load_sentence_model()
+# PDF files to index; the bare file names are checked with os.path.exists
+# relative to the current working directory.
+PDF_FILES = ["DeepSeekR1.pdf", "Chatbots.pdf"]
+@st.cache_resource
+def load_and_index_pdfs():
+    """Load the PDFs in PDF_FILES, chunk and embed them, and build a retriever.
+
+    Every loaded page gets a ``source`` metadata entry naming its PDF (and page,
+    when the loader provides one). Missing or unreadable PDFs are reported with
+    ``st.error`` and skipped.
+
+    Returns:
+        A Chroma retriever configured with ``k=5``, or ``None`` when no document
+        could be loaded, the vector store could not be created, or an
+        unexpected error occurred.
+    """
+    try:
+        with st.spinner("Processing PDF documents..."):
+            documents = []
+            for pdf in PDF_FILES:
+                if os.path.exists(pdf):
+                    try:
+                        loader = PyPDFLoader(pdf)
+                        docs = loader.load()
+                        for doc in docs:
+                            doc.metadata["source"] = pdf
+                            # NOTE(review): PyPDFLoader page numbers are presumably
+                            # zero-based, so the first page may be labelled
+                            # "Page 0" -- confirm before showing this to users.
+                            if "page" in doc.metadata:
+                                doc.metadata["source"] = f"{pdf} (Page {doc.metadata['page']})"
+                        documents.extend(docs)
+                    except Exception as pdf_error:
+                        # One broken PDF must not prevent the others from loading.
+                        st.error(f"Error loading {pdf}: {str(pdf_error)}")
+                else:
+                    st.error(f"Error: {pdf} not found!")
+            if not documents:
+                st.error("No documents were successfully loaded!")
+                return None
+            # Split documents into chunks with error handling
+            try:
+                text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+                splits = text_splitter.split_documents(documents)
+            except Exception as split_error:
+                st.error(f"Error splitting documents: {str(split_error)}")
+                # Fallback: index the unsplit documents as loaded
+                splits = documents
+            # Create embeddings with fallback options
+            try:
+                # Try the primary embedding model
+                st.info("Creating document embeddings...")
+                embeddings = HuggingFaceEmbeddings(
+                    model_name="sentence-transformers/all-MiniLM-L6-v2",
+                    model_kwargs={"token": HF_API_KEY}
+                )
+                # Smoke-test the model so a broken one is caught here rather
+                # than while building the vector store.
+                test_embed = embeddings.embed_query("test")
+                if not test_embed or len(test_embed) == 0:
+                    raise ValueError("Embedding model returned empty embeddings")
+            except Exception as embed_error:
+                st.warning(f"Primary embedding model failed: {str(embed_error)}")
+                st.info("Trying alternative embedding model...")
+                try:
+                    # Try a different model as fallback
+                    embeddings = HuggingFaceEmbeddings(
+                        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+                        model_kwargs={"token": HF_API_KEY}
+                    )
+                except Exception as embed_error2:
+                    st.error(f"Fallback embedding model also failed: {str(embed_error2)}")
+                    st.warning("Using a basic embedding model. Search results may be less accurate.")
+                    # Define a custom embedding function as last resort
+                    from langchain.embeddings.base import Embeddings
+                    import numpy as np
+                    class BasicEmbeddings(Embeddings):
+                        def embed_documents(self, texts):
+                            """Create simple embeddings for a list of texts."""
+                            return [self._basic_embed(text) for text in texts]
+                        def embed_query(self, text):
+                            """Create simple embeddings for a query."""
+                            return self._basic_embed(text)
+                        def _basic_embed(self, text):
+                            """Create a simple embedding from the set of words in the text."""
+                            # Each distinct word increments the slot chosen by
+                            # the sum of its character codes modulo 384.
+                            unique_words = set(text.lower().split())
+                            embedding = np.zeros(384)  # Match MiniLM dimension
+                            for i, word in enumerate(unique_words):
+                                hash_val = sum(ord(c) for c in word) % 384
+                                embedding[hash_val] += 1
+                            # Normalize the embedding
+                            norm = np.linalg.norm(embedding)
+                            if norm > 0:
+                                embedding = embedding / norm
+                            return embedding.tolist()
+                    embeddings = BasicEmbeddings()
+            try:
+                # Create vectorstore with error handling
+                # NOTE(review): the store is persisted in ./chroma_db; building it
+                # again after a process restart may add the same chunks a second
+                # time -- verify whether the directory should be reused or reset.
+                vectorstore = Chroma.from_documents(
+                    splits,
+                    embedding=embeddings,
+                    persist_directory="./chroma_db"
+                )
+                return vectorstore.as_retriever(search_kwargs={"k": 5})
+            except Exception as vector_error:
+                st.error(f"Error creating vector store: {str(vector_error)}")
+                return None
+    except Exception as e:
+        st.error(f"Error processing PDFs: {str(e)}")
+        return None
+# Module-level retriever; it is None when indexing failed.
+retriever = load_and_index_pdfs()
+def check_document_relevance(query, documents, min_similarity=0.2):
+    """Check if retrieved documents are truly relevant using semantic similarity with improved error handling."""
+    if not documents:
+        return [], []
+    try:
+        # Encode query
+        query_embedding = sentence_model.encode(query, convert_to_tensor=True)
+        relevant_docs = []
+        relevant_scores = []
+        for doc in documents:
+            try:
+                # Calculate similarity between query and document
+                doc_embedding = sentence_model.encode(doc.page_content, convert_to_tensor=True)
+                # Handle different return types from different models
+                if hasattr(util, "pytorch_cos_sim"):
+                    similarity = util.pytorch_cos_sim(query_embedding, doc_embedding).item()
+                else:
+                    # Fallback to manual cosine similarity calculation
+                    import torch.nn.functional as F
+                    import torch
+                    if not isinstance(query_embedding, torch.Tensor):
+                        query_embedding = torch.tensor(query_embedding)
+                    if not isinstance(doc_embedding, torch.Tensor):
+                        doc_embedding = torch.tensor(doc_embedding)
+                    # Ensure embeddings are properly shaped
+                    if len(query_embedding.shape) == 1:
+                        query_embedding = query_embedding.unsqueeze(0)
+                    if len(doc_embedding.shape) == 1:
+                        doc_embedding = doc_embedding.unsqueeze(0)
+                    # Calculate cosine similarity
+                    similarity = F.cosine_similarity(query_embedding, doc_embedding).item()
+                # Only consider document if similarity exceeds threshold
+                if similarity > min_similarity:
+                    relevant_docs.append(doc)
+                    relevant_scores.append(similarity)
+            except Exception as e:
+                # If similarity calculation fails for this document, skip it
+                print(f"Error calculating similarity for document: {str(e)}")
+                continue
+        # Sort documents by relevance score
+        sorted_pairs = sorted(zip(relevant_docs, relevant_scores), key=lambda x: x[1], reverse=True)
+        # Unzip if any relevant documents exist
+        if sorted_pairs:
+            relevant_docs, relevant_scores = zip(*sorted_pairs)
+            return list(relevant_docs), list(relevant_scores)
+        else:
+            return [], []
+    except Exception as e:
+        # If everything fails, return all documents
+        print(f"Error in relevance check: {str(e)}")
+        return documents, [0.5] * len(documents)  # Assign medium relevance score
+def is_follow_up_request(query):
+    """Determine if the query is asking for more information/elaboration on previous response."""
+    follow_up_patterns = [
+        r'(tell|explain|describe|give).+more',
+        r'(elaborate|clarify|expand)',
+        r'(more|additional) (information|details|explanation)',
+        r'(could|can) you (give|provide) (more|additional)',
+        r'(go|dive) (into|deeper)',
+        r'(explain|elaborate) (this|that|it)',
+        r'(what|how) (do|does|about) (that|this|it)',
+        r'(why|how) (is|are|was|were) (that|this|it)',
+        r'(more|examples)',
+        r'(please|pls)'
+    ]
+    query_lower = query.lower()
+    # Direct check for common follow-up phrases
+    if any(re.search(pattern, query_lower) for pattern in follow_up_patterns):
+        return True
+    # Simple phrases that indicate follow-up
+    follow_up_phrases = [
+        "more", "further", "continue", "go on", "what else", "and", "also", "in addition",
+        "next", "then", "after", "what about", "tell me more", "elaborate", "explain"
+    ]
+    # Check for these phrases
+    for phrase in follow_up_phrases:
+        if phrase in query_lower:
+            return True
+    return False
+# Improved context management function
+def manage_conversation_context(max_history=10):
+    """Maintain a sliding window of conversation history to prevent context overflow."""
+    # Limit the history to the most recent exchanges
+    if len(st.session_state.conversation_context) > max_history * 2:  # Each exchange is 2 entries (Q&A)
+        # Keep the most recent exchanges
+        st.session_state.conversation_context = st.session_state.conversation_context[-max_history * 2:]
+    # Also limit question and answer history
+    if len(st.session_state.question_history) > max_history:
+        st.session_state.question_history = st.session_state.question_history[-max_history:]
+    if len(st.session_state.answer_history) > max_history:
+        st.session_state.answer_history = st.session_state.answer_history[-max_history:]
+# Function to check if a question is new or repeat
+def is_new_question(question):
+    """Check if a question is new by comparing its hash with previously asked questions."""
+    # Normalize the question text (lowercase, remove punctuation)
+    normalized = re.sub(r'[^\w\s]', '', question.lower())
+    # Calculate hash
+    question_hash = hashlib.md5(normalized.encode()).hexdigest()
+    # Check if we've seen this question before
+    if question_hash in st.session_state.question_hash_set:
+        return False
+    # Add to our set of seen questions
+    st.session_state.question_hash_set.add(question_hash)
+    return True
+# Improved function to identify if a query is a follow-up question from our suggested follow-ups
+def is_suggested_follow_up(query):
+    """Check if the query matches one of our previously suggested follow-up questions."""
+    if not query or len(st.session_state.messages) < 2:
+        return False, None
+    # Clean the query
+    clean_query = query.strip().lower().rstrip('?')
+    # Look through recent assistant messages for suggested follow-ups
+    for i, msg in enumerate(reversed(st.session_state.messages)):
+        if msg["role"] == "assistant" and i < 6:  # Only check recent messages
+            follow_up_match = re.search(r'💡 \*\*Follow-up question:\*\* (.*?)$', msg["content"])
+            if follow_up_match:
+                suggested = follow_up_match.group(1).strip().lower().rstrip('?')
+                # Check similarity - exact match or very high similarity
+                if clean_query == suggested:
+                    return True, msg["content"]
+                # Check if they're very similar (e.g., minor rewording)
+                similarity = calculate_text_similarity(clean_query, suggested)
+                if similarity > 0.85:  # High threshold for similarity
+                    return True, msg["content"]
+    return False, None
+# Helper function to calculate text similarity
+def calculate_text_similarity(text1, text2):
+    """Calculate similarity between two text strings."""
+    try:
+        # Use sentence model to calculate similarity
+        embed1 = sentence_model.encode(text1, convert_to_tensor=True)
+        embed2 = sentence_model.encode(text2, convert_to_tensor=True)
+        similarity = util.pytorch_cos_sim(embed1, embed2).item()
+        return similarity
+    except Exception as e:
+        print(f"Error calculating similarity: {e}")
+        return 0.0
+    # Check if this is one of our suggested follow-up questions
+    is_follow_up, previous_content = is_suggested_follow_up(prompt)
+    # If it's a follow-up question we suggested, treat it as a new question
+    if is_follow_up:
+        # We want to answer this as a new query, not elaborate on the previous topic
+        pass
+    # Filter documents by relevance
+    relevant_docs, similarity_scores = check_document_relevance(prompt, context_docs, min_similarity=0.2)
+    # Extract sources
+    sources = set()
+    has_relevant_info = len(relevant_docs) > 0
+    for doc in relevant_docs:
+        if hasattr(doc, "metadata") and "source" in doc.metadata:
+            sources.add(doc.metadata["source"])
+    # If no relevant context was found in the PDFs
+    if not has_relevant_info:
+        # No specific information - generate a simple response
+        answer = generate_no_docs_response(prompt)
+        answer += f"\n\n💡 **Follow-up question:** Would you like to explore a topic from the educational documents instead?"
+        return answer, None, False, "Would you like to explore a topic from the educational documents instead?"
+    # Add the question to our history
+    if is_new_question(prompt):
+        st.session_state.question_history.append(prompt)
+    # Generate response from model
+    raw_response = generate_response_from_model(prompt)
+    # Post-process the response
+    final_response, new_follow_up = post_process_response(raw_response, prompt, ", ".join(sorted(sources)))
+    # Add the answer to our history
+    answer_only = re.sub(r'💡 \*\*Follow-up question:\*\*.*$', '', final_response, flags=re.DOTALL).strip()
+    answer_only = re.sub(r'📌 \*\*Source:\*\*.*$', '', answer_only, flags=re.DOTALL).strip()
+    st.session_state.answer_history.append(answer_only)
+    # Manage context size
+    manage_conversation_context()
+    return final_response, ", ".join(sorted(sources)), False, new_follow_up
+def clean_model_output(raw_response):
+    """Strip prompt instructions and template artifacts from raw model output.
+
+    The text goes through several passes: drop a leading preamble, cut spans
+    that start with known prompt phrases, drop lines that begin with
+    system-message wording, remove any follow-up-question section, and
+    normalise blank lines.
+
+    Args:
+        raw_response: the text produced by the model.
+
+    Returns:
+        The cleaned text with surrounding whitespace removed.
+    """
+    # First pass: Remove common model prefixes
+    if "You are" in raw_response or "I am" in raw_response or "Based on" in raw_response:
+        content_start = None
+        # For each preamble marker found, locate where its sentence or paragraph
+        # ends; content_start ends up as the furthest such position.
+        for pattern in [
+            "The current date is",
+            "headquartered in Paris",
+            "Based on your knowledge",
+            "Based on the information",
+            "Answer this question",
+            "You are an educational",
+            "I am an AI",
+            "As an educational"
+        ]:
+            pattern_loc = raw_response.find(pattern)
+            if pattern_loc > -1:
+                # Find the end of this paragraph or a period
+                para_end = raw_response.find("\n\n", pattern_loc)
+                period_end = raw_response.find(". ", pattern_loc)
+                # Use whichever end we find first (and is valid)
+                if para_end > -1 and period_end > -1:
+                    end_pos = min(para_end, period_end)
+                elif para_end > -1:
+                    end_pos = para_end
+                elif period_end > -1:
+                    end_pos = period_end + 1  # Include the period
+                else:
+                    end_pos = -1
+                if end_pos > -1 and (content_start is None or end_pos > content_start):
+                    content_start = end_pos + 2  # Skip past the end marker
+        # If we found a break point, skip everything before it
+        if content_start and content_start < len(raw_response):
+            raw_response = raw_response[content_start:]
+    # Remove strings that indicate a prompt or instruction
+    # NOTE(review): "This means that" is in this list, so an ordinary sentence
+    # starting with that phrase is removed as well -- confirm this is intended.
+    prompt_indicators = [
+        "Based on your knowledge, create a response",
+        "Answer this question based ONLY on the information provided below:",
+        "Answer this question:",
+        "Question:",
+        "Information:",
+        "Be concise, educational, and helpful.",
+        "End with a thoughtful follow-up question",
+        "Answer based on",
+        "This means that",
+        "A related follow-up question",
+        "Use this information:",
+        "Based on your knowledge"
+    ]
+    for indicator in prompt_indicators:
+        # Only the first occurrence of each indicator is handled.
+        if indicator in raw_response:
+            start_index = raw_response.find(indicator)
+            # Find end of line or paragraph or sentence
+            end_options = [
+                raw_response.find("\n\n", start_index),
+                raw_response.find("\n", start_index),
+                raw_response.find(". ", start_index)
+            ]
+            # Filter out -1 values and find the closest endpoint
+            end_options = [x for x in end_options if x > -1]
+            if end_options:
+                end_index = min(end_options)
+                if end_index > start_index:
+                    # If it ends with a period, include it
+                    if raw_response[end_index:end_index+2] == ". ":
+                        end_index += 1
+                    # Cut from the indicator through the chosen end marker.
+                    raw_response = raw_response[:start_index] + raw_response[end_index+1:]
+            else:
+                # If no endpoint found, just remove the indicator
+                raw_response = raw_response.replace(indicator, "")
+    # Remove lines that start with typical system message indicators
+    lines = raw_response.split("\n")
+    cleaned_lines = []
+    skip_patterns = [
+        "answer this question",
+        "question:",
+        "information:",
+        "you are",
+        "i am",
+        "the current date is",
+        "be concise",
+        "end with",
+        "provide a detailed",
+        "follow-up question",
+        "use this information",
+        "based on your knowledge"
+    ]
+    for line in lines:
+        lower_line = line.lower()
+        # Drop the line when it starts with a skip pattern, or when it contains
+        # one of the two instruction fragments anywhere.
+        if not any(lower_line.startswith(pattern) for pattern in skip_patterns):
+            if not any(pattern in lower_line for pattern in ["based only on", "concise and helpful"]):
+                cleaned_lines.append(line)
+    # Rejoin cleaned lines
+    cleaned_text = "\n".join(cleaned_lines)
+    # Remove any isolated "Information:" or "Question:" lines
+    cleaned_text = re.sub(r'(?:^|\n)Information:(?:\n|$)', '\n', cleaned_text)
+    cleaned_text = re.sub(r'(?:^|\n)Question:(?:\n|$)', '\n', cleaned_text)
+    # Remove the follow-up question section. With DOTALL and no MULTILINE, each
+    # pattern runs from its marker through the end of the text.
+    follow_up_patterns = [
+        r'Follow-up Question:.*?$',
+        r'Follow-up question:.*?$',
+        r'\*\*Follow-up question:\*\*.*?$',
+        r'\*\*Follow-up Question:\*\*.*?$'
+    ]
+    for pattern in follow_up_patterns:
+        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.DOTALL)
+    # Remove any trailing system instructions
+    cleaned_text = re.sub(r'\[insert thoughtful follow-up.*?\]', '', cleaned_text, flags=re.DOTALL)
+    # Collapse runs of three or more newlines into a single blank line
+    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
+    # Finally, trim leading and trailing whitespace
+    return cleaned_text.strip()
+def extract_follow_up_question(context_text, prev_question=None):
+    """Generate a contextually appropriate follow-up question."""
+    # If we already asked a follow-up question, avoid repetition
+    if prev_question and "key differences between early chatbots" in prev_question:
+        return "What are some applications of chatbots in various industries?"
+    # Find keywords in the context to generate a relevant question
+    context_lower = context_text.lower()
+    if "chatbot" in context_lower or "eliza" in context_lower:
+        return "What are some key differences between early chatbots like ELIZA and modern conversational AI systems?"
+    elif "deepseek" in context_lower:
+        return "How does DeepSeek-R1 compare to other large language models in terms of reasoning capabilities?"
+    elif "knowledge distillation" in context_lower:
+        return "What are other techniques besides knowledge distillation that can make large models more efficient?"
+    elif "language model" in context_lower or "model" in context_lower:
+        return "What challenges do researchers face when developing more powerful language models?"
+    elif "reasoning" in context_lower:
+        return "How do reasoning capabilities in AI systems differ from human reasoning processes?"
+    # Default follow-up
+    return "What other aspects of this topic would you like to explore?"
+def is_conversational_input(prompt):
+    """Check if the user input is conversational rather than a document query."""
+    conversational_patterns = [
+        r'^(hi|hello|hey|greetings|howdy)[\s!.?]*$',
+        r'^(how are you|how\'s it going|what\'s up|how do you do)[\s!.?]*$',
+        r'^(good morning|good afternoon|good evening|good night)[\s!.?]*$',
+        r'^(thanks|thank you|thx|ty)[\s!.?]*$',
+        r'^(bye|goodbye|see you|farewell)[\s!.?]*$',
+        r'^(clear|reset|start over|new conversation)[\s!.?]*$'
+    ]
+    prompt_lower = prompt.lower().strip()
+    return any(re.match(pattern, prompt_lower) for pattern in conversational_patterns)
+def generate_conversational_response(prompt):
+    """Generate a friendly conversational response with educational follow-ups."""
+    prompt_lower = prompt.lower().strip()
+    if re.match(r'^(hi|hello|hey|greetings|howdy)[\s!.?]*$', prompt_lower):
+        return "Hello! I'm your educational assistant. I can help you understand concepts from the documents or answer your questions. What would you like to learn about today?", True
+    elif re.match(r'^(how are you|how\'s it going|what\'s up|how do you do)[\s!.?]*$', prompt_lower):
+        return "I'm here and ready to help you learn! What topic from the documents would you like to explore today?", True
+    elif re.match(r'^(good morning|good afternoon|good evening)[\s!.?]*$', prompt_lower):
+        return f"{prompt.capitalize()}! What educational topics are you interested in exploring today?", True
+    elif re.match(r'^(thanks|thank you|thx|ty)[\s!.?]*$', prompt_lower):
+        return "You're welcome! Learning is a journey we take together. Would you like to explore another topic from the documents?", True
+    elif re.match(r'^(bye|goodbye|see you|farewell)[\s!.?]*$', prompt_lower):
+        return "Goodbye! Remember, learning is a lifelong journey. Feel free to return when you have more questions!", False
+    elif re.match(r'^(clear|reset|start over|new conversation)[\s!.?]*$', prompt_lower):
+        return "I'll start a new conversation. Your previous conversation history has been cleared.", True
+    else:
+        return "I'm here to help you learn. What specific topic from the documents would you like to explore?", True
+def detect_conversation_topic_shift(prompt, conversation_history, threshold=0.4):
+    """Detect if the conversation is shifting to a new topic."""
+    if len(conversation_history) < 2:
+        return False, 0.0
+    # Get the average embedding of the last few exchanges (up to 3)
+    recent_exchanges = conversation_history[-min(6, len(conversation_history)):]
+    recent_text = " ".join(recent_exchanges)
+    prompt_embedding = sentence_model.encode(prompt, convert_to_tensor=True)
+    recent_embedding = sentence_model.encode(recent_text, convert_to_tensor=True)
+    similarity = util.pytorch_cos_sim(prompt_embedding, recent_embedding).item()
+    return similarity < threshold, similarity
def extract_information_from_docs(docs, limit=2000):
    """Concatenate document texts without exceeding ``limit`` characters.

    Whole chunks are joined with a blank line between them. The first chunk
    that does not fit is cut short and suffixed with ``"..."`` (the ellipsis is
    counted against the limit), but only when more than 100 characters of room
    remain; otherwise it is dropped. Nothing after that chunk is used.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string. Objects
            without that attribute are skipped. ``None`` is treated as empty.
        limit: Maximum number of characters in the returned text.

    Returns:
        The combined text with leading and trailing whitespace removed.
    """
    ellipsis = "..."
    separator = "\n\n"
    pieces = []
    used = 0
    for doc in docs or []:
        if not hasattr(doc, "page_content"):
            continue
        text = doc.page_content
        if used + len(text) <= limit:
            pieces.append(text)
            used += len(text) + len(separator)
            continue
        remaining = limit - used
        if remaining > 100:  # Only add if we can get a meaningful chunk
            # Reserve room for the ellipsis so the result stays within limit.
            pieces.append(text[: remaining - len(ellipsis)] + ellipsis)
        break
    return separator.join(pieces).strip()
def post_process_response(response, prompt, sources=None, prev_follow_up=None):
    """Assemble the final reply: cleaned answer, optional source line, follow-up.

    Args:
        response: Raw model output.
        prompt: The question being answered (accepted but not used here).
        sources: Source description to cite; no citation is added when falsy.
        prev_follow_up: Previously suggested follow-up, forwarded to
            ``extract_follow_up_question``.

    Returns:
        ``(formatted_reply, follow_up_question)``.
    """
    answer_text = clean_model_output(response)
    suggestion = extract_follow_up_question(answer_text, prev_follow_up)
    citation = f"\n\n📌 **Source:** {sources}" if sources else ""
    closing = f"\n\n💡 **Follow-up question:** {suggestion}"
    return answer_text + citation + closing, suggestion
+# Modified generate_response_from_model function to handle different question types
def generate_response_from_model(prompt, is_elaboration=False):
    """Generate an answer from the language model alone, without document text.

    Args:
        prompt: The user question, or the earlier query to elaborate on.
        is_elaboration: When true the model is asked for more detail instead
            of a direct answer.

    Returns:
        The cleaned model output. When the model or tokenizer is missing, or
        generation raises, an error message string is returned instead (a
        generation failure is also reported through ``st.error``).
    """
    if model is None or tokenizer is None:
        return "Error: Model could not be loaded."
    # Determine the prompt type
    if is_elaboration:
        model_prompt = "Provide more information and details about this topic."
    else:
        model_prompt = "Answer this question directly and factually."
    system_message = "You are a helpful educational assistant that provides factual information about topics related to AI, language models, and conversational systems. Answer the question directly without repeating the question."
    user_message = f"{model_prompt} Question: {prompt}"
    try:
        with st.spinner("Generating response..."):
            if hasattr(tokenizer, "apply_chat_template"):
                messages = [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ]
                # add_generation_prompt opens the assistant turn so the model
                # answers instead of continuing the user message.
                encoded = tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    return_tensors="pt",
                )
            else:
                combined_prompt = f"{system_message}\n\nUser: {user_message}"
                encoded = tokenizer(combined_prompt, return_tensors="pt")
            # Depending on the call and library version the tokenizer returns
            # either a bare tensor or a mapping that holds "input_ids".
            input_ids = encoded if torch.is_tensor(encoded) else encoded["input_ids"]
            # Follow the model's own placement instead of assuming CUDA.
            input_ids = input_ids.to(model.device)
            outputs = model.generate(
                input_ids,
                # Single unpadded sequence: every position is a real token.
                attention_mask=torch.ones_like(input_ids),
                max_new_tokens=500,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )
            # generate() returns prompt + continuation; decode only the newly
            # generated tokens so the system/user text is not echoed back.
            new_tokens = outputs[0][input_ids.shape[-1]:]
            raw_response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            # Drop any preamble up to and including these boilerplate phrases.
            raw_response = re.sub(r'.*?(As an AI|I am an AI|You asked about|In response to your question|Regarding your question)', '', raw_response, flags=re.DOTALL)
            raw_response = raw_response.lstrip()
            # Remove any instances where the model repeats the question
            question_patterns = [
                f"Question: {re.escape(prompt)}",
                f"{re.escape(prompt)}",
                "The question is about",
                "You asked about",
            ]
            for pattern in question_patterns:
                raw_response = re.sub(pattern, '', raw_response, flags=re.IGNORECASE)
            return raw_response.strip()
    except Exception as e:
        # UI boundary: surface the failure to the user rather than crashing.
        st.error(f"Error generating response: {str(e)}")
        return f"I'm sorry, there was an error generating a response. Error: {str(e)}"
def generate_no_docs_response(prompt):
    """Build the reply used when no relevant document passages were found.

    Prompts that mention a cooking-related word get a reply that says the
    documents do not discuss recipes; every other prompt gets a generic
    reminder of the assistant's scope.

    Args:
        prompt: The user's message.

    Returns:
        The reply text.
    """
    cooking_terms = ("recipe", "cooking", "food", "baking",
                     "soup", "meal", "ingredient", "dish")
    # Match whole words (optionally plural) so that e.g. "piecemeal" or
    # "disheartening" are not mistaken for cooking questions.
    cooking_pattern = r"\b(?:" + "|".join(cooking_terms) + r")(?:e?s)?\b"
    response = "The documents don't contain information about this topic. "
    # Add a gentle reminder about the scope of the assistant
    if re.search(cooking_pattern, prompt, flags=re.IGNORECASE):
        response += "I'm an educational assistant focused on the documents provided, which don't discuss cooking recipes."
    else:
        response += "I'm focused on the educational content in the provided documents."
    return response
+# Complete process_query function definition
def process_query(prompt, context_docs):
    """Route a user message to the right handler and build the reply.

    Handling order: small talk and reset commands, then free-form requests to
    elaborate on the previous answer, then ordinary questions, which are
    answered by the model only when the retrieved documents are relevant.

    Args:
        prompt: The user's latest message.
        context_docs: Documents retrieved for ``prompt``.

    Returns:
        ``(response, sources, should_reset, follow_up)``. ``sources`` is a
        comma-separated string (possibly empty) or ``None``; ``should_reset``
        is true only for a reset command; ``follow_up`` is the suggested next
        question or ``None``.
    """
    # Small talk and reset commands never reach the model.
    if is_conversational_input(prompt):
        response, _ = generate_conversational_response(prompt)
        wants_reset = bool(re.match(r'^(clear|reset|start over|new conversation)[\s!.?]*$', prompt.lower().strip()))
        return response, None, wants_reset, None
    # Check if this is one of our suggested follow-up questions
    is_follow_up, _ = is_suggested_follow_up(prompt)
    prev_follow_up = _last_assistant_field(r'💡 \*\*Follow-up question:\*\* (.*?)$')
    context = st.session_state.conversation_context
    # A suggested follow-up is answered like any new question; only a
    # free-form "tell me more" style request re-asks the earlier query.
    if is_follow_up_request(prompt) and not is_follow_up and len(context) >= 2:
        # Fall back to the context entry the previous implementation used.
        original_query = _previous_user_query(prompt) or context[-2]
        raw_response = generate_response_from_model(original_query, is_elaboration=True)
        # The source line sits in the middle of the stored message, so "$"
        # must be allowed to match at a line end, not only at the string end.
        sources = _last_assistant_field(r'📌 \*\*Source:\*\* (.*?)$', re.MULTILINE)
        final_response, new_follow_up = post_process_response(raw_response, original_query, sources=sources, prev_follow_up=prev_follow_up)
        return final_response, sources, False, new_follow_up
    # Not an elaboration request: process as a new query.
    topic_shift_warning = ""
    if len(context) >= 4:
        is_topic_shift, _ = detect_conversation_topic_shift(prompt, context)
        if is_topic_shift:
            topic_shift_warning = "⚠️ It seems you're starting a new topic. I'll try to answer, but keep in mind this is different from what we were discussing. "
    # Filter documents by relevance
    relevant_docs, _ = check_document_relevance(prompt, context_docs, min_similarity=0.2)
    if len(relevant_docs) == 0:
        # Nothing relevant in the PDFs: answer with a fixed scope reminder.
        redirect = "Would you like to explore a topic from the educational documents instead?"
        answer = topic_shift_warning + generate_no_docs_response(prompt)
        answer += f"\n\n💡 **Follow-up question:** {redirect}"
        return answer, None, False, redirect
    source_names = {
        doc.metadata["source"]
        for doc in relevant_docs
        if hasattr(doc, "metadata") and "source" in doc.metadata
    }
    source_text = ", ".join(sorted(source_names))
    # Add the question to our history
    if is_new_question(prompt):
        st.session_state.question_history.append(prompt)
    raw_response = generate_response_from_model(prompt)
    final_response, new_follow_up = post_process_response(raw_response, prompt, source_text, prev_follow_up)
    if topic_shift_warning:
        final_response = topic_shift_warning + final_response
    # Keep only the answer text (no follow-up, no source line) in the history.
    answer_only = re.sub(r'💡 \*\*Follow-up question:\*\*.*$', '', final_response, flags=re.DOTALL).strip()
    answer_only = re.sub(r'📌 \*\*Source:\*\*.*$', '', answer_only, flags=re.DOTALL).strip()
    st.session_state.answer_history.append(answer_only)
    # Manage context size
    manage_conversation_context()
    return final_response, source_text, False, new_follow_up


def _last_assistant_field(pattern, flags=0):
    """Return group 1 of ``pattern`` from the newest assistant message that matches, else None."""
    for msg in reversed(st.session_state.messages):
        if msg["role"] != "assistant":
            continue
        found = re.search(pattern, msg["content"], flags)
        if found:
            return found.group(1)
    return None


def _previous_user_query(prompt):
    """Return the newest earlier user question, skipping the current prompt, small talk and elaboration requests."""
    current_skipped = False
    for msg in reversed(st.session_state.messages):
        if msg["role"] != "user":
            continue
        text = msg["content"]
        # The chat UI stores the current prompt before calling process_query.
        if not current_skipped and text == prompt:
            current_skipped = True
            continue
        if is_conversational_input(text) or is_follow_up_request(text):
            continue
        return text
    return None
def generate_response(prompt, context_docs, conversation_history):
    """Build an educational reply using an explicitly supplied history.

    Args:
        prompt: The user's latest message.
        context_docs: Documents retrieved for ``prompt``.
        conversation_history: Sequence of earlier messages; its last two
            entries are read as previous query and previous answer.

    Returns:
        ``(response, sources, should_reset, follow_up)``. ``should_reset`` is
        true only when the prompt is a reset command.
    """
    # Small talk and reset commands are answered from fixed text.
    if is_conversational_input(prompt):
        reply, _ = generate_conversational_response(prompt)
        reset_requested = bool(re.match(r'^(clear|reset|start over|new conversation)[\s!.?]*$', prompt.lower().strip()))
        return reply, None, reset_requested, None

    # Most recent follow-up suggestion shown to the user, if any.
    prev_follow_up = None
    for entry in reversed(st.session_state.messages):
        if entry["role"] != "assistant":
            continue
        hit = re.search(r'💡 \*\*Follow-up question:\*\* (.*?)$', entry["content"])
        if hit:
            prev_follow_up = hit.group(1)
            break

    # Elaboration request: re-ask the previous query without document content.
    if is_follow_up_request(prompt) and len(conversation_history) >= 2:
        earlier_query = conversation_history[-2]
        elaboration = generate_response_from_model(earlier_query, is_elaboration=True)
        formatted, suggestion = post_process_response(elaboration, earlier_query, sources=None, prev_follow_up=prev_follow_up)
        return formatted, None, False, suggestion

    # New query: warn when it drifts away from the recent conversation.
    warning = ""
    if len(conversation_history) >= 4:
        shifted, _ = detect_conversation_topic_shift(prompt, conversation_history)
        if shifted:
            warning = "⚠️ It seems you're starting a new topic. I'll try to answer, but keep in mind this is different from what we were discussing. "

    matching_docs, _ = check_document_relevance(prompt, context_docs, min_similarity=0.2)
    if not len(matching_docs) > 0:
        # Nothing relevant in the PDFs: reply with the fixed scope reminder.
        redirect = "Would you like to explore a topic from the educational documents instead?"
        fallback = warning + generate_no_docs_response(prompt)
        fallback += f"\n\n💡 **Follow-up question:** {redirect}"
        return fallback, None, False, redirect

    cited = ", ".join(sorted({
        doc.metadata["source"]
        for doc in matching_docs
        if hasattr(doc, "metadata") and "source" in doc.metadata
    }))
    # The model is prompted without document text to avoid leaking it.
    model_text = generate_response_from_model(prompt)
    formatted, suggestion = post_process_response(model_text, prompt, cited, prev_follow_up)
    if warning:
        formatted = warning + formatted
    return formatted, cited, False, suggestion
# Streamlit App UI
st.title("📖 Educational PDF Chatbot")
# Sidebar: static info plus the list of loaded PDFs
st.sidebar.title("System Info")
st.sidebar.info("Educational Assistant")
st.sidebar.write("Documents loaded:")
for pdf in PDF_FILES:
    st.sidebar.write(f"- {pdf}")


def _reset_conversation(first_message):
    """Clear all per-conversation session state and seed the chat with one assistant message."""
    st.session_state.conversation_context = []
    st.session_state.conversation_turns = 0
    st.session_state.prev_follow_up = None
    st.session_state.question_history = []
    st.session_state.answer_history = []
    st.session_state.question_hash_set = set()
    st.session_state.messages = [{"role": "assistant", "content": first_message}]


# Initialize session state for chat history
if "messages" not in st.session_state:
    welcome_msg = "Hello! I'm your educational assistant. I can help you understand concepts in the documents. What would you like to explore today?"
    st.session_state.messages = [{"role": "assistant", "content": welcome_msg}]
# Initialize conversation context tracker
if "conversation_context" not in st.session_state:
    st.session_state.conversation_context = []
# Number of user turns so far in this conversation
if "conversation_turns" not in st.session_state:
    st.session_state.conversation_turns = 0
# Last suggested follow-up question
if "prev_follow_up" not in st.session_state:
    st.session_state.prev_follow_up = None
# Button to clear the conversation; it resets the same state as the typed
# "reset" command so the two paths cannot drift apart.
col1, col2 = st.columns([4, 1])
with col2:
    if st.button("New Conversation"):
        _reset_conversation("Starting a new conversation. What would you like to learn about today?")
        st.rerun()
if retriever:
    # Display chat messages
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])
    # User input
    if prompt := st.chat_input("What would you like to learn today?"):
        # Record the user message before generating the answer
        st.session_state.messages.append({"role": "user", "content": prompt})
        st.session_state.conversation_context.append(prompt)
        st.session_state.conversation_turns += 1
        with st.chat_message("user"):
            st.markdown(prompt)
        # Generate response
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                try:
                    # get_relevant_documents is deprecated in LangChain in
                    # favour of invoke; use the old name only as a fallback.
                    fetch_docs = getattr(retriever, "invoke", None) or retriever.get_relevant_documents
                    retrieved_docs = fetch_docs(prompt)
                    answer, sources, should_reset, new_follow_up = process_query(prompt, retrieved_docs)
                    # Handle conversation reset if needed
                    if should_reset:
                        _reset_conversation(answer)
                        st.rerun()
                    # Store response in chat history
                    st.session_state.messages.append({"role": "assistant", "content": answer})
                    st.session_state.prev_follow_up = new_follow_up
                    # Store just the answer text without sources and follow-up in conversation context
                    answer_only = re.sub(r'💡 \*\*Follow-up question:\*\*.*$', '', answer, flags=re.DOTALL).strip()
                    answer_only = re.sub(r'📌 \*\*Source:\*\*.*$', '', answer_only, flags=re.DOTALL).strip()
                    st.session_state.conversation_context.append(answer_only)
                    # Display the formatted response
                    st.markdown(answer)
                except Exception as e:
                    # UI boundary: show the failure in the chat instead of crashing.
                    error_msg = f"An error occurred: {str(e)}"
                    st.error(error_msg)
                    st.session_state.messages.append({"role": "assistant", "content": error_msg})
else:
    st.error("Failed to load document retrieval system.")

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+streamlit
+langchain
+langchain-core
+langchain_community
+langchain_huggingface
+PyPDF2
+chromadb==0.4.24
+uvicorn
+pymupdf
+pypdf
+python-dotenv
+transformers
+sentence-transformers
+accelerate>=0.26.0
+bitsandbytes>=0.41.1
+sentencepiece