LeedsLibraries committed
Commit cc84f47 · verified
1 Parent(s): 170162e

Upload Chatbot WS files

Files changed (5)
  1. .gitattributes +2 -0
  2. Chatbots.pdf +3 -0
  3. DeepSeekR1.pdf +3 -0
  4. app.py +1114 -0
  5. requirements.txt +16 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Chatbots.pdf filter=lfs diff=lfs merge=lfs -text
+ DeepSeekR1.pdf filter=lfs diff=lfs merge=lfs -text
Chatbots.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e40deb492c8fa092846fa4970a48522900b4fb17e47f4f0bbc5b725fe4278f58
+ size 1644160
DeepSeekR1.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a73a44c4adc33d64b30df00f55074e4a28d710250002a67b07ca06729f57575
+ size 656741
app.py ADDED
@@ -0,0 +1,1114 @@
+ import streamlit as st
+ import os
+ import re
+ import hashlib
+ import torch
+ import numpy as np
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain_huggingface import HuggingFaceEmbeddings
+ # Used to persist the Hugging Face token for model downloads
+ from huggingface_hub import HfFolder
+
+ # Hugging Face Hub download settings (HF_HUB_ENABLE_HF_TRANSFER=1 assumes the
+ # hf_transfer package is installed; downloads fail if it is missing)
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+ os.environ["HF_HUB_DISABLE_EXPERIMENTAL_WARNING"] = "1"
+ os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "0"
+
+ # Ensure NumPy 2.0 compatibility
+ np.float_ = np.float64
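+ # Note: NumPy 2.0 removed the np.float_ alias, and some pinned dependencies
+ # (e.g. chromadb 0.4.x) may still reference it; restoring the alias is a
+ # compatibility shim rather than a supported NumPy API.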
+
+ # Streamlit Page Config (set_page_config must be the first Streamlit call)
+ st.set_page_config(page_title="📖 Educational PDF Chatbot", layout="wide")
+
+ # Session state for question/answer history
+ if "question_history" not in st.session_state:
+     st.session_state.question_history = []
+
+ if "answer_history" not in st.session_state:
+     st.session_state.answer_history = []
+
+ if "question_hash_set" not in st.session_state:
+     st.session_state.question_hash_set = set()
+
+ # Hugging Face API Details
+ HF_API_KEY = st.secrets.get("HF_API_KEY", os.getenv("HF_API_KEY"))
+
+ if not HF_API_KEY:
+     st.error("Hugging Face API key is missing! Please set HF_API_KEY in Streamlit secrets or environment variables.")
+     raise ValueError("Hugging Face API key is missing!")
+
+ # Persist the token for Hub downloads
+ HfFolder.save_token(HF_API_KEY)
+
+ # Model Selection - quantized 8B model
+ MODEL_NAME = "Noorhan/mistral-8b-4bit"
+
+ @st.cache_resource
+ def load_quantized_model():
+     """Loads a quantized version of the model."""
+     try:
+         st.info(f"Loading model {MODEL_NAME}, this may take a few minutes...")
+
+         # Configure quantization
+         quantization_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_use_double_quant=True,
+         )
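+         # NF4 ("normal float 4") packs weights into 4 bits, roughly quartering
+         # fp16 memory use, and double quantization also compresses the
+         # quantization constants; matmuls still run in float16 via
+         # bnb_4bit_compute_dtype.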
+
+         # Load tokenizer
+         tokenizer = AutoTokenizer.from_pretrained(
+             MODEL_NAME,
+             token=HF_API_KEY,
+             trust_remote_code=True,
+         )
+
+         # Load model
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_NAME,
+             quantization_config=quantization_config,
+             device_map="auto",
+             torch_dtype=torch.float16,
+             token=HF_API_KEY,
+         )
+
+         st.success(f"Model {MODEL_NAME} loaded successfully!")
+         return model, tokenizer
+     except Exception as e:
+         st.error(f"Error loading model: {str(e)}")
+         return None, None
+
+ # Display loading message first
+ if "model_loaded" not in st.session_state:
+     st.session_state.model_loaded = False
+     st.info("Initializing model... This may take a few minutes on first load.")
+
+ # Try to load the model
+ model, tokenizer = None, None
+ if not st.session_state.model_loaded:
+     with st.spinner("Loading model..."):
+         model, tokenizer = load_quantized_model()
+         if model is not None:
+             st.session_state.model_loaded = True
+ else:
+     # Use cached model if already loaded
+     model, tokenizer = load_quantized_model()
+
+ # Load Sentence Transformer model for similarity checking
+ @st.cache_resource
+ def load_sentence_model():
+     """Loads sentence transformer model for text similarity with improved error handling."""
+     with st.spinner("Loading similarity model..."):
+         try:
+             model_name = "sentence-transformers/all-MiniLM-L6-v2"
+
+             # Create cache directory if it doesn't exist
+             cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
+             os.makedirs(cache_dir, exist_ok=True)
+
+             # Try to use the model
+             st.info(f"Attempting to load sentence transformer model: {model_name}")
+             return SentenceTransformer(model_name, token=HF_API_KEY)
+
+         except (FileNotFoundError, ConnectionError, OSError) as e:
+             st.warning(f"Error loading the primary model: {str(e)}")
+             st.info("Attempting to use a fallback model...")
+
+             try:
+                 # Try a different model as fallback
+                 fallback_model = "all-mpnet-base-v2"
+                 return SentenceTransformer(f"sentence-transformers/{fallback_model}", token=HF_API_KEY)
+             except Exception as e2:
+                 st.error(f"Failed to load fallback model: {str(e2)}")
+
+                 # Last resort - create a simple embedding model
+                 st.warning("Using a simplified embedding approach.")
+
+                 # Define a simple class that mimics the SentenceTransformer interface
+                 class SimpleEmbedder:
+                     def encode(self, texts, convert_to_tensor=True):
+                         """Simple word-based encoding."""
+                         if isinstance(texts, str):
+                             texts = [texts]
+
+                         # Create simple embeddings (word-based vectors)
+                         embeddings = []
+                         for text in texts:
+                             words = set(text.lower().split())
+                             embedding = np.zeros(384)  # Match MiniLM dimension
+
+                             # Accumulate character codes per word slot for a
+                             # deterministic but very crude embedding
+                             for i, word in enumerate(words):
+                                 if i < 384:
+                                     for char in word:
+                                         embedding[i] += ord(char) / 255.0
+
+                             embeddings.append(embedding)
+
+                         if convert_to_tensor:
+                             return torch.tensor(embeddings)
+                         return np.array(embeddings)
+
+                 return SimpleEmbedder()
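+ # Note: SimpleEmbedder is a last-resort stand-in so the app can keep running
+ # when no sentence-transformers model can be downloaded; its character-code
+ # vectors carry almost no semantic signal, so the similarity checks built on
+ # it will be much less reliable.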
+
+ sentence_model = load_sentence_model()
+
+ # Define PDF Files to Process
+ PDF_FILES = ["DeepSeekR1.pdf", "Chatbots.pdf"]
+
+ @st.cache_resource
+ def load_and_index_pdfs():
+     """Load and process multiple PDFs into a single vector store with source tracking and improved error handling."""
+     try:
+         with st.spinner("Processing PDF documents..."):
+             documents = []
+             for pdf in PDF_FILES:
+                 if os.path.exists(pdf):
+                     try:
+                         loader = PyPDFLoader(pdf)
+                         docs = loader.load()
+
+                         for doc in docs:
+                             doc.metadata["source"] = pdf
+                             if "page" in doc.metadata:
+                                 doc.metadata["source"] = f"{pdf} (Page {doc.metadata['page']})"
+
+                         documents.extend(docs)
+                     except Exception as pdf_error:
+                         st.error(f"Error loading {pdf}: {str(pdf_error)}")
+                 else:
+                     st.error(f"Error: {pdf} not found!")
+
+             if not documents:
+                 st.error("No documents were successfully loaded!")
+                 return None
+
+             # Split documents into chunks with error handling
+             try:
+                 text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+                 splits = text_splitter.split_documents(documents)
+             except Exception as split_error:
+                 st.error(f"Error splitting documents: {str(split_error)}")
+                 # Fallback to simpler splitting
+                 splits = documents
+
+             # Create embeddings with fallback options
+             try:
+                 # Try the primary embedding model
+                 st.info("Creating document embeddings...")
+                 embeddings = HuggingFaceEmbeddings(
+                     model_name="sentence-transformers/all-MiniLM-L6-v2",
+                     model_kwargs={"token": HF_API_KEY}
+                 )
+
+                 # Test the embeddings
+                 test_embed = embeddings.embed_query("test")
+                 if not test_embed or len(test_embed) == 0:
+                     raise ValueError("Embedding model returned empty embeddings")
+
+             except Exception as embed_error:
+                 st.warning(f"Primary embedding model failed: {str(embed_error)}")
+                 st.info("Trying alternative embedding model...")
+
+                 try:
+                     # Try a different model as fallback
+                     embeddings = HuggingFaceEmbeddings(
+                         model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+                         model_kwargs={"token": HF_API_KEY}
+                     )
+                 except Exception as embed_error2:
+                     st.error(f"Fallback embedding model also failed: {str(embed_error2)}")
+                     st.warning("Using a basic embedding model. Search results may be less accurate.")
+
+                     # Define a custom embedding function as last resort
+                     from langchain.embeddings.base import Embeddings
+
+                     class BasicEmbeddings(Embeddings):
+                         def embed_documents(self, texts):
+                             """Create simple embeddings for a list of texts."""
+                             return [self._basic_embed(text) for text in texts]
+
+                         def embed_query(self, text):
+                             """Create simple embeddings for a query."""
+                             return self._basic_embed(text)
+
+                         def _basic_embed(self, text):
+                             """Create a simple embedding based on word frequencies."""
+                             # Hash each word into one of 384 buckets and count
+                             unique_words = set(text.lower().split())
+                             embedding = np.zeros(384)  # Match MiniLM dimension
+
+                             for word in unique_words:
+                                 hash_val = sum(ord(c) for c in word) % 384
+                                 embedding[hash_val] += 1
+
+                             # Normalize the embedding
+                             norm = np.linalg.norm(embedding)
+                             if norm > 0:
+                                 embedding = embedding / norm
+
+                             return embedding.tolist()
+
+                     embeddings = BasicEmbeddings()
+
+             try:
+                 # Create vectorstore with error handling
+                 vectorstore = Chroma.from_documents(
+                     splits,
+                     embedding=embeddings,
+                     persist_directory="./chroma_db"
+                 )
+
+                 return vectorstore.as_retriever(search_kwargs={"k": 5})
+
+             except Exception as vector_error:
+                 st.error(f"Error creating vector store: {str(vector_error)}")
+                 return None
+
+     except Exception as e:
+         st.error(f"Error processing PDFs: {str(e)}")
+         return None
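+ # Note: search_kwargs={"k": 5} makes the retriever return the five nearest
+ # chunks per query; with chunk_size=1000 and chunk_overlap=200 above, adjacent
+ # chunks share 200 characters, which reduces the chance that an answer is
+ # split across a chunk boundary.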
+
+ retriever = load_and_index_pdfs()
+
+ def check_document_relevance(query, documents, min_similarity=0.2):
+     """Check if retrieved documents are truly relevant using semantic similarity with improved error handling."""
+     if not documents:
+         return [], []
+
+     try:
+         # Encode query
+         query_embedding = sentence_model.encode(query, convert_to_tensor=True)
+
+         relevant_docs = []
+         relevant_scores = []
+
+         for doc in documents:
+             try:
+                 # Calculate similarity between query and document
+                 doc_embedding = sentence_model.encode(doc.page_content, convert_to_tensor=True)
+
+                 # Handle different return types from different models
+                 if hasattr(util, "pytorch_cos_sim"):
+                     similarity = util.pytorch_cos_sim(query_embedding, doc_embedding).item()
+                 else:
+                     # Fallback to manual cosine similarity calculation
+                     import torch.nn.functional as F
+
+                     if not isinstance(query_embedding, torch.Tensor):
+                         query_embedding = torch.tensor(query_embedding)
+                     if not isinstance(doc_embedding, torch.Tensor):
+                         doc_embedding = torch.tensor(doc_embedding)
+
+                     # Ensure embeddings are properly shaped
+                     if len(query_embedding.shape) == 1:
+                         query_embedding = query_embedding.unsqueeze(0)
+                     if len(doc_embedding.shape) == 1:
+                         doc_embedding = doc_embedding.unsqueeze(0)
+
+                     # Calculate cosine similarity
+                     similarity = F.cosine_similarity(query_embedding, doc_embedding).item()
+
+                 # Only consider document if similarity exceeds threshold
+                 if similarity > min_similarity:
+                     relevant_docs.append(doc)
+                     relevant_scores.append(similarity)
+             except Exception as e:
+                 # If similarity calculation fails for this document, skip it
+                 print(f"Error calculating similarity for document: {str(e)}")
+                 continue
+
+         # Sort documents by relevance score
+         sorted_pairs = sorted(zip(relevant_docs, relevant_scores), key=lambda x: x[1], reverse=True)
+
+         # Unzip if any relevant documents exist
+         if sorted_pairs:
+             relevant_docs, relevant_scores = zip(*sorted_pairs)
+             return list(relevant_docs), list(relevant_scores)
+         else:
+             return [], []
+
+     except Exception as e:
+         # If everything fails, return all documents
+         print(f"Error in relevance check: {str(e)}")
+         return documents, [0.5] * len(documents)  # Assign medium relevance score
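+ # Note: min_similarity=0.2 is a deliberately low bar that mainly discards
+ # clearly off-topic chunks; raise it if the bot cites loosely related passages
+ # too often, at the cost of more "no relevant information" answers.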
+
+ def is_follow_up_request(query):
+     """Determine if the query is asking for more information/elaboration on a previous response."""
+     follow_up_patterns = [
+         r'(tell|explain|describe|give).+more',
+         r'(elaborate|clarify|expand)',
+         r'(more|additional) (information|details|explanation)',
+         r'(could|can) you (give|provide) (more|additional)',
+         r'(go|dive) (into|deeper)',
+         r'(explain|elaborate) (this|that|it)',
+         r'(what|how) (do|does|about) (that|this|it)',
+         r'(why|how) (is|are|was|were) (that|this|it)',
+         r'(more|examples)',
+         r'(please|pls)'
+     ]
+
+     query_lower = query.lower()
+
+     # Direct check for common follow-up phrases
+     if any(re.search(pattern, query_lower) for pattern in follow_up_patterns):
+         return True
+
+     # Simple phrases that indicate a follow-up
+     # NOTE: these are plain substring checks, so they over-match ("and" is
+     # found inside "standard"); treat a hit here as a weak signal only.
+     follow_up_phrases = [
+         "more", "further", "continue", "go on", "what else", "and", "also", "in addition",
+         "next", "then", "after", "what about", "tell me more", "elaborate", "explain"
+     ]
+
+     # Check for these phrases
+     for phrase in follow_up_phrases:
+         if phrase in query_lower:
+             return True
+
+     return False
+
+ # Sliding-window context management
+ def manage_conversation_context(max_history=10):
+     """Maintain a sliding window of conversation history to prevent context overflow."""
+     # Limit the history to the most recent exchanges
+     if len(st.session_state.conversation_context) > max_history * 2:  # Each exchange is 2 entries (Q&A)
+         # Keep the most recent exchanges
+         st.session_state.conversation_context = st.session_state.conversation_context[-max_history * 2:]
+
+     # Also limit question and answer history
+     if len(st.session_state.question_history) > max_history:
+         st.session_state.question_history = st.session_state.question_history[-max_history:]
+
+     if len(st.session_state.answer_history) > max_history:
+         st.session_state.answer_history = st.session_state.answer_history[-max_history:]
+
+ # Check whether a question is new or a repeat
+ def is_new_question(question):
+     """Check if a question is new by comparing its hash with previously asked questions."""
+     # Normalize the question text (lowercase, remove punctuation)
+     normalized = re.sub(r'[^\w\s]', '', question.lower())
+
+     # Calculate hash
+     question_hash = hashlib.md5(normalized.encode()).hexdigest()
+
+     # Check if we've seen this question before
+     if question_hash in st.session_state.question_hash_set:
+         return False
+
+     # Add to our set of seen questions
+     st.session_state.question_hash_set.add(question_hash)
+     return True
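+ # Note: thanks to the normalization above, "What is ELIZA?" and "what is eliza"
+ # hash to the same MD5 digest and count as repeats, while any real wording
+ # change makes the question register as new.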
+
+ # Identify whether a query matches one of our previously suggested follow-ups
+ def is_suggested_follow_up(query):
+     """Check if the query matches one of our previously suggested follow-up questions."""
+     if not query or len(st.session_state.messages) < 2:
+         return False, None
+
+     # Clean the query
+     clean_query = query.strip().lower().rstrip('?')
+
+     # Look through recent assistant messages for suggested follow-ups
+     for i, msg in enumerate(reversed(st.session_state.messages)):
+         if msg["role"] == "assistant" and i < 6:  # Only check recent messages
+             follow_up_match = re.search(r'💡 \*\*Follow-up question:\*\* (.*?)$', msg["content"])
+             if follow_up_match:
+                 suggested = follow_up_match.group(1).strip().lower().rstrip('?')
+
+                 # Check similarity - exact match or very high similarity
+                 if clean_query == suggested:
+                     return True, msg["content"]
+
+                 # Check if they're very similar (e.g., minor rewording)
+                 similarity = calculate_text_similarity(clean_query, suggested)
+                 if similarity > 0.85:  # High threshold for similarity
+                     return True, msg["content"]
+
+     return False, None
+
+ # Helper function to calculate text similarity
+ def calculate_text_similarity(text1, text2):
+     """Calculate similarity between two text strings."""
+     try:
+         # Use the sentence model to calculate cosine similarity
+         embed1 = sentence_model.encode(text1, convert_to_tensor=True)
+         embed2 = sentence_model.encode(text2, convert_to_tensor=True)
+
+         similarity = util.pytorch_cos_sim(embed1, embed2).item()
+         return similarity
+     except Exception as e:
+         print(f"Error calculating similarity: {e}")
+         return 0.0
+
+ def clean_model_output(raw_response):
+     """Thoroughly clean the model output to remove all prompt instructions and artifacts."""
+     # First pass: Remove common model prefixes
+     if "You are" in raw_response or "I am" in raw_response or "Based on" in raw_response:
+         content_start = None
+
+         # Look for paragraph breaks after standard prefixes and preambles
+         for pattern in [
+             "The current date is",
+             "headquartered in Paris",
+             "Based on your knowledge",
+             "Based on the information",
+             "Answer this question",
+             "You are an educational",
+             "I am an AI",
+             "As an educational"
+         ]:
+             pattern_loc = raw_response.find(pattern)
+             if pattern_loc > -1:
+                 # Find the end of this paragraph or a period
+                 para_end = raw_response.find("\n\n", pattern_loc)
+                 period_end = raw_response.find(". ", pattern_loc)
+
+                 # Use whichever end we find first (and is valid)
+                 if para_end > -1 and period_end > -1:
+                     end_pos = min(para_end, period_end)
+                 elif para_end > -1:
+                     end_pos = para_end
+                 elif period_end > -1:
+                     end_pos = period_end + 1  # Include the period
+                 else:
+                     end_pos = -1
+
+                 if end_pos > -1 and (content_start is None or end_pos > content_start):
+                     content_start = end_pos + 2  # Skip past the end marker
+
+         # If we found a break point, skip everything before it
+         if content_start and content_start < len(raw_response):
+             raw_response = raw_response[content_start:]
+
+     # Remove strings that indicate a prompt or instruction
+     prompt_indicators = [
+         "Based on your knowledge, create a response",
+         "Answer this question based ONLY on the information provided below:",
+         "Answer this question:",
+         "Question:",
+         "Information:",
+         "Be concise, educational, and helpful.",
+         "End with a thoughtful follow-up question",
+         "Answer based on",
+         "This means that",
+         "A related follow-up question",
+         "Use this information:",
+         "Based on your knowledge"
+     ]
+
+     for indicator in prompt_indicators:
+         if indicator in raw_response:
+             start_index = raw_response.find(indicator)
+             # Find end of line or paragraph or sentence
+             end_options = [
+                 raw_response.find("\n\n", start_index),
+                 raw_response.find("\n", start_index),
+                 raw_response.find(". ", start_index)
+             ]
+             # Filter out -1 values and find the closest endpoint
+             end_options = [x for x in end_options if x > -1]
+             if end_options:
+                 end_index = min(end_options)
+                 if end_index > start_index:
+                     # If it ends with a period, include it
+                     if raw_response[end_index:end_index+2] == ". ":
+                         end_index += 1
+                     raw_response = raw_response[:start_index] + raw_response[end_index+1:]
+             else:
+                 # If no endpoint found, just remove the indicator
+                 raw_response = raw_response.replace(indicator, "")
+
+     # Remove lines that start with typical system message indicators
+     lines = raw_response.split("\n")
+     cleaned_lines = []
+
+     skip_patterns = [
+         "answer this question",
+         "question:",
+         "information:",
+         "you are",
+         "i am",
+         "the current date is",
+         "be concise",
+         "end with",
+         "provide a detailed",
+         "follow-up question",
+         "use this information",
+         "based on your knowledge"
+     ]
+
+     for line in lines:
+         lower_line = line.lower()
+         if not any(lower_line.startswith(pattern) for pattern in skip_patterns):
+             if not any(pattern in lower_line for pattern in ["based only on", "concise and helpful"]):
+                 cleaned_lines.append(line)
+
+     # Rejoin cleaned lines
+     cleaned_text = "\n".join(cleaned_lines)
+
+     # Remove any isolated "Information:" or "Question:" labels
+     cleaned_text = re.sub(r'(?:^|\n)Information:(?:\n|$)', '\n', cleaned_text)
+     cleaned_text = re.sub(r'(?:^|\n)Question:(?:\n|$)', '\n', cleaned_text)
+
+     # Remove the follow-up question section
+     follow_up_patterns = [
+         r'Follow-up Question:.*?$',
+         r'Follow-up question:.*?$',
+         r'\*\*Follow-up question:\*\*.*?$',
+         r'\*\*Follow-up Question:\*\*.*?$'
+     ]
+
+     for pattern in follow_up_patterns:
+         cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.DOTALL)
+
+     # Remove any trailing system instructions
+     cleaned_text = re.sub(r'\[insert thoughtful follow-up.*?\]', '', cleaned_text, flags=re.DOTALL)
+
+     # Clean up excessive whitespace
+     cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)
+
+     # Finally, clean up extra spaces and trim
+     return cleaned_text.strip()
+
+ def extract_follow_up_question(context_text, prev_question=None):
+     """Generate a contextually appropriate follow-up question."""
+     # If we already asked a follow-up question, avoid repetition
+     if prev_question and "key differences between early chatbots" in prev_question:
+         return "What are some applications of chatbots in various industries?"
+
+     # Find keywords in the context to generate a relevant question
+     context_lower = context_text.lower()
+
+     if "chatbot" in context_lower or "eliza" in context_lower:
+         return "What are some key differences between early chatbots like ELIZA and modern conversational AI systems?"
+
+     elif "deepseek" in context_lower:
+         return "How does DeepSeek-R1 compare to other large language models in terms of reasoning capabilities?"
+
+     elif "knowledge distillation" in context_lower:
+         return "What are other techniques besides knowledge distillation that can make large models more efficient?"
+
+     elif "language model" in context_lower or "model" in context_lower:
+         return "What challenges do researchers face when developing more powerful language models?"
+
+     elif "reasoning" in context_lower:
+         return "How do reasoning capabilities in AI systems differ from human reasoning processes?"
+
+     # Default follow-up
+     return "What other aspects of this topic would you like to explore?"
+
+ def is_conversational_input(prompt):
+     """Check if the user input is conversational rather than a document query."""
+     conversational_patterns = [
+         r'^(hi|hello|hey|greetings|howdy)[\s!.?]*$',
+         r'^(how are you|how\'s it going|what\'s up|how do you do)[\s!.?]*$',
+         r'^(good morning|good afternoon|good evening|good night)[\s!.?]*$',
+         r'^(thanks|thank you|thx|ty)[\s!.?]*$',
+         r'^(bye|goodbye|see you|farewell)[\s!.?]*$',
+         r'^(clear|reset|start over|new conversation)[\s!.?]*$'
+     ]
+
+     prompt_lower = prompt.lower().strip()
+     return any(re.match(pattern, prompt_lower) for pattern in conversational_patterns)
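+ # Note: the anchored patterns only match when the whole input is a greeting -
+ # "hey!" is treated as small talk, while "hey, what is ELIZA?" falls through
+ # to the document-query path.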
+
+ def generate_conversational_response(prompt):
+     """Generate a friendly conversational response with educational follow-ups."""
+     prompt_lower = prompt.lower().strip()
+
+     if re.match(r'^(hi|hello|hey|greetings|howdy)[\s!.?]*$', prompt_lower):
+         return "Hello! I'm your educational assistant. I can help you understand concepts from the documents or answer your questions. What would you like to learn about today?", True
+
+     elif re.match(r'^(how are you|how\'s it going|what\'s up|how do you do)[\s!.?]*$', prompt_lower):
+         return "I'm here and ready to help you learn! What topic from the documents would you like to explore today?", True
+
+     elif re.match(r'^(good morning|good afternoon|good evening)[\s!.?]*$', prompt_lower):
+         return f"{prompt.capitalize()}! What educational topics are you interested in exploring today?", True
+
+     elif re.match(r'^(thanks|thank you|thx|ty)[\s!.?]*$', prompt_lower):
+         return "You're welcome! Learning is a journey we take together. Would you like to explore another topic from the documents?", True
+
+     elif re.match(r'^(bye|goodbye|see you|farewell)[\s!.?]*$', prompt_lower):
+         return "Goodbye! Remember, learning is a lifelong journey. Feel free to return when you have more questions!", False
+
+     elif re.match(r'^(clear|reset|start over|new conversation)[\s!.?]*$', prompt_lower):
+         return "I'll start a new conversation. Your previous conversation history has been cleared.", True
+
+     else:
+         return "I'm here to help you learn. What specific topic from the documents would you like to explore?", True
+
+ def detect_conversation_topic_shift(prompt, conversation_history, threshold=0.4):
+     """Detect if the conversation is shifting to a new topic."""
+     if len(conversation_history) < 2:
+         return False, 0.0
+
+     # Compare against the last few exchanges (up to 3 Q&A pairs = 6 entries)
+     recent_exchanges = conversation_history[-min(6, len(conversation_history)):]
+     recent_text = " ".join(recent_exchanges)
+
+     prompt_embedding = sentence_model.encode(prompt, convert_to_tensor=True)
+     recent_embedding = sentence_model.encode(recent_text, convert_to_tensor=True)
+
+     similarity = util.pytorch_cos_sim(prompt_embedding, recent_embedding).item()
+
+     return similarity < threshold, similarity
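+ # Note: a prompt whose cosine similarity to the recent exchange text drops
+ # below the 0.4 threshold is flagged as a topic shift; callers only prepend a
+ # warning, they never block the question.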
+
+ def extract_information_from_docs(docs, limit=2000):
+     """Extract information from documents up to a character limit."""
+     extracted_text = ""
+     current_length = 0
+
+     for doc in docs:
+         if not hasattr(doc, "page_content"):
+             continue
+
+         if current_length + len(doc.page_content) <= limit:
+             extracted_text += doc.page_content + "\n\n"
+             current_length += len(doc.page_content) + 2
+         else:
+             # Add a partial chunk to reach the limit
+             remaining = limit - current_length
+             if remaining > 100:  # Only add if we can get a meaningful chunk
+                 extracted_text += doc.page_content[:remaining] + "..."
+             break
+
+     return extracted_text.strip()
+
+ def post_process_response(response, prompt, sources=None, prev_follow_up=None):
+     """Format the response with proper source citation and follow-up."""
+     # Clean the response
+     clean_response = clean_model_output(response)
+
+     # Generate a follow-up question based on the content
+     follow_up = extract_follow_up_question(clean_response, prev_follow_up)
+
+     # Add source citation if available
+     if sources:
+         clean_response += f"\n\n📌 **Source:** {sources}"
+
+     # Add the follow-up question
+     clean_response += f"\n\n💡 **Follow-up question:** {follow_up}"
+
+     return clean_response, follow_up
+
+ # Generate a direct answer from the language model (handles both fresh
+ # questions and elaboration requests)
+ def generate_response_from_model(prompt, is_elaboration=False):
+     """Generate a direct response from the model without any document context or content."""
+     if model is None or tokenizer is None:
+         return "Error: Model could not be loaded."
+
+     # Determine the prompt type
+     if is_elaboration:
+         model_prompt = "Provide more information and details about this topic."
+     else:
+         model_prompt = "Answer this question directly and factually."
+
+     try:
+         # Generate response
+         with st.spinner("Generating response..."):
+             # Format for model
+             system_message = "You are a helpful educational assistant that provides factual information about topics related to AI, language models, and conversational systems. Answer the question directly without repeating the question."
+             user_message = f"{model_prompt} Question: {prompt}"
+
+             if hasattr(tokenizer, "apply_chat_template"):
+                 messages = [
+                     {"role": "system", "content": system_message},
+                     {"role": "user", "content": user_message}
+                 ]
+                 inputs = tokenizer.apply_chat_template(
+                     messages, add_generation_prompt=True, return_tensors="pt"
+                 ).to(model.device)
+             else:
+                 combined_prompt = f"{system_message}\n\nUser: {user_message}"
+                 inputs = tokenizer(combined_prompt, return_tensors="pt").input_ids.to(model.device)
+
+             # Generate with increased token limit
+             outputs = model.generate(
+                 inputs,
+                 max_new_tokens=500,
+                 temperature=0.7,
+                 top_p=0.9,
+                 do_sample=True,
+                 eos_token_id=tokenizer.eos_token_id,
+                 pad_token_id=tokenizer.eos_token_id,
+                 repetition_penalty=1.1
+             )
+
+             # Decode only the newly generated tokens so the prompt is not echoed back
+             raw_response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
+
+             # Extra cleaning in case the model still restates the question
+             raw_response = re.sub(r'.*?(As an AI|I am an AI|You asked about|In response to your question|Regarding your question)', '', raw_response, flags=re.DOTALL)
+             raw_response = raw_response.lstrip()
+
+             # Remove any instances where the model repeats the question
+             question_patterns = [
+                 f"Question: {re.escape(prompt)}",
+                 f"{re.escape(prompt)}",
+                 "The question is about",
+                 "You asked about"
+             ]
+
+             for pattern in question_patterns:
+                 raw_response = re.sub(pattern, '', raw_response, flags=re.IGNORECASE)
+
+             return raw_response
+
+     except Exception as e:
+         st.error(f"Error generating response: {str(e)}")
+         return f"I'm sorry, there was an error generating a response. Error: {str(e)}"
+
+ def generate_no_docs_response(prompt):
+     """Generate a response when no relevant docs are found."""
+     response = "The documents don't contain information about this topic. "
+
+     # Add a gentle reminder about the scope of the assistant
+     if any(x in prompt.lower() for x in ["recipe", "cooking", "food", "baking",
+                                          "soup", "meal", "ingredient", "dish"]):
+         response += "I'm an educational assistant focused on the documents provided, which don't discuss cooking recipes."
+     else:
+         response += "I'm focused on the educational content in the provided documents."
+
+     return response
+
+ # Main query dispatcher used by the chat UI below
+ def process_query(prompt, context_docs):
+     """Process different types of queries appropriately."""
+     # First check if this is a conversational input
+     if is_conversational_input(prompt):
+         response, should_continue = generate_conversational_response(prompt)
+
+         # Check if this is a reset request
+         if re.match(r'^(clear|reset|start over|new conversation)[\s!.?]*$', prompt.lower().strip()):
+             return response, None, True, None
+
+         return response, None, False, None
+
+     # Check if this is one of our suggested follow-up questions
+     is_follow_up, previous_content = is_suggested_follow_up(prompt)
+
+     # Get previous follow-up question if any
+     prev_follow_up = None
+     if len(st.session_state.messages) > 0:
+         for msg in reversed(st.session_state.messages):
+             if msg["role"] == "assistant":
+                 follow_up_match = re.search(r'💡 \*\*Follow-up question:\*\* (.*?)$', msg["content"])
+                 if follow_up_match:
+                     prev_follow_up = follow_up_match.group(1)
+                 break
+
+     # Handle follow-up/elaboration requests specifically
+     if (is_follow_up_request(prompt) or is_follow_up) and len(st.session_state.conversation_context) >= 2:
+         if is_follow_up:
+             # This is a suggested follow-up - treat it as a new question
+             pass  # Continue with normal processing
+         else:
+             # This is a user asking for elaboration
+             # Get the previous exchange (original question)
+             original_query = None
+             for i in range(len(st.session_state.conversation_context) - 2, -1, -2):
+                 if i < len(st.session_state.conversation_context):
+                     original_query = st.session_state.conversation_context[i]
+                     break
+
+             if not original_query:
+                 original_query = st.session_state.conversation_context[-2]  # Fallback
+
+             # Generate an elaborated response
+             raw_response = generate_response_from_model(original_query, is_elaboration=True)
+
+             # Get sources from previous response if available
+             sources = None
+             for msg in reversed(st.session_state.messages):
+                 if msg["role"] == "assistant":
+                     source_match = re.search(r'📌 \*\*Source:\*\* (.*?)$', msg["content"])
+                     if source_match:
+                         sources = source_match.group(1)
+                     break
+
+             final_response, new_follow_up = post_process_response(raw_response, original_query, sources=sources, prev_follow_up=prev_follow_up)
+
+             return final_response, sources, False, new_follow_up
+
+     # Not a follow-up, process as a new query
+     # Detect topic shift
+     topic_shift_warning = ""
+     if len(st.session_state.conversation_context) >= 4:
+         is_topic_shift, similarity_score = detect_conversation_topic_shift(prompt, st.session_state.conversation_context)
+         if is_topic_shift:
+             topic_shift_warning = "⚠️ It seems you're starting a new topic. I'll try to answer, but keep in mind this is different from what we were discussing. "
+
+     # Filter documents by relevance
+     relevant_docs, similarity_scores = check_document_relevance(prompt, context_docs, min_similarity=0.2)
+
+     # Extract sources
+     sources = set()
+     has_relevant_info = len(relevant_docs) > 0
+
+     for doc in relevant_docs:
+         if hasattr(doc, "metadata") and "source" in doc.metadata:
+             sources.add(doc.metadata["source"])
+
+     # If no relevant context was found in the PDFs
+     if not has_relevant_info:
+         # No specific information - generate a simple response
+         answer = topic_shift_warning + generate_no_docs_response(prompt)
+         answer += "\n\n💡 **Follow-up question:** Would you like to explore a topic from the educational documents instead?"
+         return answer, None, False, "Would you like to explore a topic from the educational documents instead?"
+
+     # Add the question to our history
+     if is_new_question(prompt):
+         st.session_state.question_history.append(prompt)
+
+     # Generate response from model
+     raw_response = generate_response_from_model(prompt)
+
+     # Post-process the response
+     final_response, new_follow_up = post_process_response(raw_response, prompt, ", ".join(sorted(sources)), prev_follow_up)
+
+     # Add topic shift warning if needed
+     if topic_shift_warning:
+         final_response = topic_shift_warning + final_response
+
+     # Add the answer to our history
+     answer_only = re.sub(r'💡 \*\*Follow-up question:\*\*.*$', '', final_response, flags=re.DOTALL).strip()
+     answer_only = re.sub(r'📌 \*\*Source:\*\*.*$', '', answer_only, flags=re.DOTALL).strip()
+     st.session_state.answer_history.append(answer_only)
+
+     # Manage context size
+     manage_conversation_context()
+
+     return final_response, ", ".join(sorted(sources)), False, new_follow_up
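+
+ # Note: generate_response() below appears to be an earlier variant of
+ # process_query(); the chat handler at the bottom of this file only calls
+ # process_query(), so this function is currently unused.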
+ def generate_response(prompt, context_docs, conversation_history):
+     """Generate an educational response with context awareness and follow-up questions."""
+     # Reset flag
+     should_reset = False
+
+     # Check if this is a conversational input
+     if is_conversational_input(prompt):
+         response, should_continue = generate_conversational_response(prompt)
+
+         # Check if this is a reset request
+         if re.match(r'^(clear|reset|start over|new conversation)[\s!.?]*$', prompt.lower().strip()):
+             # Set the reset flag
+             should_reset = True
+
+         return response, None, should_reset, None
+
+     # Get previous follow-up question if any
+     prev_follow_up = None
+     if len(st.session_state.messages) > 0:
+         for msg in reversed(st.session_state.messages):
+             if msg["role"] == "assistant":
+                 follow_up_match = re.search(r'💡 \*\*Follow-up question:\*\* (.*?)$', msg["content"])
+                 if follow_up_match:
+                     prev_follow_up = follow_up_match.group(1)
+                 break
+
+     # Handle follow-up/elaboration requests specifically
+     if is_follow_up_request(prompt) and len(conversation_history) >= 2:
+         # Get the previous exchange
+         prev_query = conversation_history[-2]   # Previous user query
+         prev_answer = conversation_history[-1]  # Previous assistant answer
+
+         # Generate an elaborated response without document content
+         raw_response = generate_response_from_model(prev_query, is_elaboration=True)
+         final_response, new_follow_up = post_process_response(raw_response, prev_query, sources=None, prev_follow_up=prev_follow_up)
+
+         return final_response, None, should_reset, new_follow_up
+
+     # Not a follow-up, process as a new query
+     # Detect topic shift
+     topic_shift_warning = ""
+     if len(conversation_history) >= 4:
+         is_topic_shift, similarity_score = detect_conversation_topic_shift(prompt, conversation_history)
+         if is_topic_shift:
+             topic_shift_warning = "⚠️ It seems you're starting a new topic. I'll try to answer, but keep in mind this is different from what we were discussing. "
+
+     # Filter documents by relevance
+     relevant_docs, similarity_scores = check_document_relevance(prompt, context_docs, min_similarity=0.2)
+
+     # Extract sources
+     sources = set()
+     has_relevant_info = len(relevant_docs) > 0
+
+     for doc in relevant_docs:
+         if hasattr(doc, "metadata") and "source" in doc.metadata:
+             sources.add(doc.metadata["source"])
+
+     # If no relevant context was found in the PDFs
+     if not has_relevant_info:
+         # No specific information - generate a simple response
+         answer = topic_shift_warning + generate_no_docs_response(prompt)
+         answer += "\n\n💡 **Follow-up question:** Would you like to explore a topic from the educational documents instead?"
+         return answer, None, should_reset, "Would you like to explore a topic from the educational documents instead?"
+
+     # Generate response from model - don't include document text to avoid leakage
+     raw_response = generate_response_from_model(prompt)
+
+     # Post-process the response
+     final_response, new_follow_up = post_process_response(raw_response, prompt, ", ".join(sorted(sources)), prev_follow_up)
+
+     # Add topic shift warning if needed
+     if topic_shift_warning:
+         final_response = topic_shift_warning + final_response
+
+     return final_response, ", ".join(sorted(sources)), should_reset, new_follow_up
+
+ # Streamlit App UI
+ st.title("📖 Educational PDF Chatbot")
+
+ # Add info section
+ st.sidebar.title("System Info")
+ st.sidebar.info("Educational Assistant")
+ st.sidebar.write("Documents loaded:")
+ for pdf in PDF_FILES:
+     st.sidebar.write(f"- {pdf}")
+
+ # Initialize session state for chat history
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+     # Add welcome message
+     welcome_msg = "Hello! I'm your educational assistant. I can help you understand concepts in the documents. What would you like to explore today?"
+     st.session_state.messages.append({"role": "assistant", "content": welcome_msg})
+
+ # Initialize conversation context tracker
+ if "conversation_context" not in st.session_state:
+     st.session_state.conversation_context = []
+
+ # Track conversation length for potential warnings
+ if "conversation_turns" not in st.session_state:
+     st.session_state.conversation_turns = 0
+
+ # Track the last suggested follow-up question to avoid repetition
+ if "prev_follow_up" not in st.session_state:
+     st.session_state.prev_follow_up = None
+
+ # Add a button to clear the conversation
+ col1, col2 = st.columns([4, 1])
+ with col2:
+     if st.button("New Conversation"):
+         st.session_state.conversation_context = []
+         st.session_state.conversation_turns = 0
+         st.session_state.messages = []
+         st.session_state.prev_follow_up = None
+         welcome_msg = "Starting a new conversation. What would you like to learn about today?"
+         st.session_state.messages.append({"role": "assistant", "content": welcome_msg})
+         st.rerun()
+
+ if retriever:
+     # Display chat messages
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])
+
+     # User input
+     if prompt := st.chat_input("What would you like to learn today?"):
+         # Add user message to history
+         st.session_state.messages.append({"role": "user", "content": prompt})
+         st.session_state.conversation_context.append(prompt)
+         st.session_state.conversation_turns += 1
+
+         with st.chat_message("user"):
+             st.markdown(prompt)
+
+         # Generate response
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 try:
+                     # Process query
+                     retrieved_docs = retriever.get_relevant_documents(prompt)
+
+                     answer, sources, should_reset, new_follow_up = process_query(prompt, retrieved_docs)
+
+                     # Handle conversation reset if needed
+                     if should_reset:
+                         st.session_state.conversation_context = []
+                         st.session_state.conversation_turns = 0
+                         st.session_state.messages = []
+                         st.session_state.question_history = []
+                         st.session_state.answer_history = []
+                         st.session_state.question_hash_set = set()
+                         st.session_state.messages.append({"role": "assistant", "content": answer})
+                         st.rerun()
+
+                     # Store response in chat history
+                     st.session_state.messages.append({"role": "assistant", "content": answer})
+
+                     # Store just the answer text, without sources and follow-up, in the conversation context
+                     answer_only = re.sub(r'💡 \*\*Follow-up question:\*\*.*$', '', answer, flags=re.DOTALL).strip()
+                     answer_only = re.sub(r'📌 \*\*Source:\*\*.*$', '', answer_only, flags=re.DOTALL).strip()
+                     st.session_state.conversation_context.append(answer_only)
+
+                     # Display the formatted response
+                     st.markdown(answer)
+
+                 except Exception as e:
+                     error_msg = f"An error occurred: {str(e)}"
+                     st.error(error_msg)
+                     st.session_state.messages.append({"role": "assistant", "content": error_msg})
+
+ else:
+     st.error("Failed to load document retrieval system.")
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ streamlit
+ langchain
+ langchain-core
+ langchain_community
+ langchain_huggingface
+ PyPDF2
+ chromadb==0.4.24
+ uvicorn
+ pymupdf
+ pypdf
+ python-dotenv
+ transformers
+ sentence-transformers
+ accelerate>=0.26.0
+ bitsandbytes>=0.41.1
+ sentencepiece
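+ # hf_transfer is assumed to be needed as well, because app.py sets
+ # HF_HUB_ENABLE_HF_TRANSFER=1; uncomment the next line if downloads fail:
+ # hf_transfer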