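"""Streamlit app: chat with multiple PDFs.

Uploaded PDFs are OCR-extracted page by page, chunked, embedded into a Chroma
vector store, and queried through a ConversationalRetrievalChain with
conversation memory and an LLM-based query-expansion step.
"""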
import os
import shutil
from tempfile import NamedTemporaryFile

import streamlit as st
from openai import OpenAI
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.docstore.document import Document
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

from processPDF import process_pdf_with_ocr
# Streamlit App Configuration
st.set_page_config(page_title="Multi-PDF Chat", layout="wide")
st.title("📄 Multi-PDF Question-Answering Chatbot")
# Prompt template for the QA chain
chat_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template("""
You are an advanced PDF analysis AI assistant. Your key responsibilities are:
- Provide precise and accurate answers based on the document contents
- Extract relevant information directly from the uploaded PDFs
- Maintain context from previous interactions
- Prioritize clarity and factual accuracy in your responses
- Give a very detailed answer with a detailed explanation

Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are taking an exam: the more accurate the answer you give, the more points you will get. Thus, always try to give an answer rather than saying "I don't know".
Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material. Reply only in English.

Context: {context}
"""),
    HumanMessagePromptTemplate.from_template("{question}"),
])
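# The template above is passed to the chain via combine_docs_chain_kwargs, so it
# must expose a {context} placeholder for the retrieved chunks and a {question}
# placeholder for the (condensed) user query.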
# Initialize session state variables
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'pdf_processed' not in st.session_state:
    st.session_state.pdf_processed = False
if 'qa_chain' not in st.session_state:
    st.session_state.qa_chain = None
if 'memory' not in st.session_state:
    st.session_state.memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key='answer'
    )
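# st.session_state survives Streamlit reruns, so the memory, the QA chain, and
# the chat history are built once per browser session instead of on every
# interaction.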
def processInput(question, client):
    """Expand the user's question into a more searchable query via an LLM call."""
    prompt = f"""
    Given the user's question: {question}
    Expand and break down this question to include relevant context and key points that should be searched for.
    Return only the expanded question. The questions are related to a financial organization, Wells Fargo.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Follow the instructions and reply politely"},
            {"role": "user", "content": prompt},
        ],
        max_tokens=4000,
    )
    # Debug: log the expanded query to the server console
    print(completion.choices[0].message.content)
    return completion.choices[0].message.content
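# Query expansion trades one extra LLM round trip for better recall: the
# expanded phrasing tends to match more of the chunked document text than a
# terse user question would.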
# Function to process PDFs
def process_pdfs(uploaded_files, openai_key):
    # Remove any vector store left over from a previous run
    shutil.rmtree("Data", ignore_errors=True)
    all_documents = []
    # Process each uploaded PDF
    for uploaded_file in uploaded_files:
        # Save uploaded file temporarily
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(uploaded_file.read())
            temp_pdf_path = temp_file.name
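        # process_pdf_with_ocr (defined in processPDF.py, not shown here) is
        # assumed to return a mapping of page number -> extracted page text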
        # Extract text from PDF with page tracking
        pdf_pages = process_pdf_with_ocr(temp_pdf_path, openai_key)
        for page_num in pdf_pages:
            page_text = pdf_pages[page_num]
            # Create a document with page number metadata
            doc = Document(
                page_content=page_text,
                metadata={'source': uploaded_file.name, 'page': page_num}
            )
            all_documents.append(doc)
        # Clean up temporary file
        os.unlink(temp_pdf_path)
    # Split documents into chunks while preserving metadata
    text_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        separator="\n"
    )
    split_docs = text_splitter.split_documents(all_documents)
    # Embed and create vector store
    embeddings = OpenAIEmbeddings(openai_api_key=openai_key, model="text-embedding-3-large")
    vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data")
    # Configure retriever with simpler settings
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})
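    # Plain top-k similarity search: no score threshold or MMR re-ranking is
    # applied, so the 10 nearest chunks always reach the prompt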
    # Set up QA chain with memory management
    llm = ChatOpenAI(
        temperature=0,
        openai_api_key=openai_key,
        model="gpt-4o-mini",
        max_tokens=500
    )
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=st.session_state.memory,
        return_source_documents=True,
        verbose=True,
        combine_docs_chain_kwargs={'prompt': chat_prompt}
    )
    return qa_chain
# Function to manage chat history length
def manage_chat_history():
    # Keep only the last 3 interactions to limit context
    if len(st.session_state.chat_history) > 3:
        st.session_state.chat_history = st.session_state.chat_history[-3:]
# Sidebar for PDF upload
if 'openai_key' not in st.session_state:
    st.session_state.openai_key = None

with st.sidebar:
    st.header("Upload PDFs")
    uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)
    if st.button("Clear Chat History"):
        st.session_state.chat_history = []
        st.session_state.memory.clear()
        st.success("Chat history cleared!")
    if uploaded_files and not st.session_state.pdf_processed:
        if not st.session_state.openai_key:
            st.session_state.openai_key = st.text_input("Enter OpenAI API Key:", type="password")
        if st.session_state.openai_key:
            os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
            with st.spinner("Processing PDFs..."):
                try:
                    st.session_state.qa_chain = process_pdfs(uploaded_files, st.session_state.openai_key)
                    st.session_state.pdf_processed = True
                    st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!")
                except Exception as e:
                    st.error(f"Error processing PDFs: {str(e)}")
                    st.session_state.pdf_processed = False
# Main chat interface
if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
    # Display chat history
    for idx, (question, answer) in enumerate(st.session_state.chat_history, 1):
        with st.chat_message("user"):
            st.write(f"**Question {idx}:** {question}")
        with st.chat_message("assistant"):
            st.write(f"**Answer {idx}:** {answer}")
    # Chat input
    if user_question := st.chat_input("Ask a question about the PDFs"):
        try:
            # Expand the question, then run the QA chain; the attached
            # ConversationBufferMemory supplies chat_history automatically
            client = OpenAI()
            expanded_query = processInput(user_question, client)
            result = st.session_state.qa_chain({"question": expanded_query})
            answer = result['answer']
            # Update and display new interaction
            st.session_state.chat_history.append((user_question, answer))
            manage_chat_history()  # Trim display history to the last 3 interactions
            with st.chat_message("user"):
                st.write(f"**Question:** {user_question}")
            with st.chat_message("assistant"):
                st.write(f"**Answer:** {answer}")
            # Show source documents with page numbers
            with st.expander("Source Documents"):
                for idx, doc in enumerate(result['source_documents'], 1):
                    st.write(f"**Source {idx}:**")
                    st.write(f"File: {doc.metadata.get('source', 'Unknown')}")
                    st.write(f"Page: {doc.metadata.get('page', 'N/A')}")
                    st.text(doc.page_content)
        except Exception as e:
            st.error(f"Error processing question: {str(e)}")
            st.warning("Please try asking a shorter question or breaking it into multiple parts.")
else:
    st.info("Please upload PDF files in the sidebar to start chatting.")