import streamlit as st from PyPDF2 import PdfReader from langchain_community.vectorstores import Chroma from langchain.text_splitter import CharacterTextSplitter from langchain.chat_models import ChatOpenAI from tempfile import NamedTemporaryFile import os from processPDF import process_pdf_with_ocr from langchain.embeddings import OpenAIEmbeddings from langchain.chains import ConversationalRetrievalChain from langchain.memory import ConversationBufferMemory from langchain.docstore.document import Document import shutil from langchain.prompts import ( ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate ) from langchain.memory import ConversationBufferMemory from langchain.prompts import PromptTemplate from openai import OpenAI # Streamlit App Configuration st.set_page_config(page_title="Multi-PDF Chat", layout="wide") st.title("📄 Multi-PDF Question-Answering Chatbot") # Create a custom system prompt system_prompt = SystemMessagePromptTemplate.from_template(""" You are an advanced PDF analysis AI assistant. Your key responsibilities are: - Provide precise and accurate answers based on the document contents - Extract relevant information directly from the uploaded PDFs - Maintain context from previous interactions - Prioritize clarity and factual accuracy in your responses - Give a very detailed answer with a detailed explaination Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are giving a exam and more accurate the anwer you give more points you will get. Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material.""") # Create a human message template human_prompt = HumanMessagePromptTemplate.from_template("{question}") # Combine the prompts chat_prompt = ChatPromptTemplate.from_messages([ system_prompt, human_prompt ]) chat_prompt = ChatPromptTemplate.from_messages([ SystemMessagePromptTemplate.from_template(""" You are an advanced PDF analysis AI assistant. Your key responsibilities are: - Provide precise and accurate answers based on the document contents - Extract relevant information directly from the uploaded PDFs - Maintain context from previous interactions - Prioritize clarity and factual accuracy in your responses Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are giving a exam and more accurate the anwer you give more points you will get. Thus always try to give an answer without saying I don't know. Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material. Reply only in English. Context: {context} """), HumanMessagePromptTemplate.from_template("{question}") ]) # Initialize session state variables if 'chat_history' not in st.session_state: st.session_state.chat_history = [] if 'pdf_processed' not in st.session_state: st.session_state.pdf_processed = False if 'qa_chain' not in st.session_state: st.session_state.qa_chain = None if 'memory' not in st.session_state: st.session_state.memory = ConversationBufferMemory( memory_key="chat_history", return_messages=True, output_key='answer' ) def processInput(question,client): prompt = f""" Given the user's question: {question} Expand and break down this question to include relevant context and key points that should be searched for. Return only the expanded question. The questions are related to an Financial organization Wells Fargo. """ completion = client.chat.completions.create( model="gpt-4o-mini", messages=[ {"role": "system", "content": "Follow the instructions and reply politely"}, {"role": "user", "content": "{}".format(prompt)} ], max_tokens=4000, ) print(completion.choices[0].message.content) return completion.choices[0].message.content # Function to process PDFs def process_pdfs(uploaded_files, openai_key): try: shutil.rmtree("Data") except: print("Not needed") all_documents = [] # Process each uploaded PDF for uploaded_file in uploaded_files: # Save uploaded file temporarily with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file: temp_file.write(uploaded_file.read()) temp_pdf_path = temp_file.name # Extract text from PDF with page tracking pdf_reader = process_pdf_with_ocr(temp_pdf_path,openai_key) for page_num in pdf_reader: page_text = pdf_reader[page_num] # Create a document with page number metadata doc = Document( page_content=page_text, metadata={'source': uploaded_file.name, 'page': page_num} ) all_documents.append(doc) # Clean up temporary file os.unlink(temp_pdf_path) # Split documents into chunks while preserving metadata text_splitter = CharacterTextSplitter( chunk_size=1000, chunk_overlap=100, length_function=len, separator="\n" ) split_docs = text_splitter.split_documents(all_documents) # Embed and create vector store embeddings = OpenAIEmbeddings(openai_api_key=openai_key, model="text-embedding-3-large") vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data") # Configure retriever with simpler settings retriever = vector_store.as_retriever(search_kwargs={"k": 10}) # Set up QA chain with memory management llm = ChatOpenAI( temperature=0, openai_api_key=openai_key, model="gpt-4o-mini", max_tokens=500 ) qa_chain = ConversationalRetrievalChain.from_llm( llm=llm, retriever=retriever, memory=st.session_state.memory, return_source_documents=True, verbose=True, combine_docs_chain_kwargs={'prompt': chat_prompt} ) return qa_chain # Function to manage chat history length def manage_chat_history(): # Keep only the last 3 interactions to limit context if len(st.session_state.chat_history) > 3: st.session_state.chat_history = st.session_state.chat_history[-3:] # Sidebar for PDF upload if 'openai_key' not in st.session_state: st.session_state.openai_key = None with st.sidebar: st.header("Upload PDFs") uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True) if st.button("Clear Chat History"): st.session_state.chat_history = [] st.session_state.memory.clear() st.success("Chat history cleared!") if uploaded_files and not st.session_state.pdf_processed: if not st.session_state.openai_key: st.session_state.openai_key = st.text_input("Enter OpenAI API Key:", type="password") if st.session_state.openai_key: os.environ["OPENAI_API_KEY"] = st.session_state.openai_key with st.spinner("Processing PDFs..."): try: st.session_state.qa_chain = process_pdfs(uploaded_files, st.session_state.openai_key) st.session_state.pdf_processed = True st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!") except Exception as e: st.error(f"Error processing PDFs: {str(e)}") st.session_state.pdf_processed = False # Main chat interface if st.session_state.pdf_processed and st.session_state.qa_chain is not None: # Display chat history for idx, (question, answer) in enumerate(st.session_state.chat_history, 1): with st.chat_message("user"): st.write(f"**Question {idx}:** {question}") with st.chat_message("assistant"): st.write(f"**Answer {idx}:** {answer}") # Chat input if user_question := st.chat_input("Ask a question about the PDFs"): try: # Run QA chain with error handling client = OpenAI() expanded_query = processInput(user_question,client) result = st.session_state.qa_chain({ "question": expanded_query, "chat_history": [] # Empty chat history to reduce tokens }) answer = result['answer'] # Update and display new interaction st.session_state.chat_history.append((user_question, answer)) manage_chat_history() # Manage chat history length with st.chat_message("user"): st.write(f"**Question:** {user_question}") with st.chat_message("assistant"): st.write(f"**Answer:** {answer}") # Show source documents with page numbers with st.expander("Source Documents"): for idx, doc in enumerate(result['source_documents'], 1): st.write(f"**Source {idx}:**") st.write(f"File: {doc.metadata.get('source', 'Unknown')}") st.write(f"Page: {doc.metadata.get('page', 'N/A')}") st.text(doc.page_content) except Exception as e: st.error(f"Error processing question: {str(e)}") st.warning("Please try asking a shorter question or breaking it into multiple parts.") else: st.info("Please upload PDF files in the sidebar to start chatting.")