import os
import shutil
from tempfile import NamedTemporaryFile

import streamlit as st
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.docstore.document import Document
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)
from openai import OpenAI

from processPDF import process_pdf_with_ocr

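# Launch (assuming this script is saved as app.py):
#   streamlit run app.py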
st.set_page_config(page_title="Multi-PDF Chat", layout="wide")
st.title("📄 Multi-PDF Question-Answering Chatbot")

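# Prompt for the chain's answer-generation ("combine docs") step. The {context}
# placeholder receives the retrieved chunks and {question} the user's query;
# both must appear in the template for ConversationalRetrievalChain to fill them in.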
chat_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template("""
You are an advanced PDF analysis AI assistant. Your key responsibilities are:
- Provide precise and accurate answers based on the document contents
- Extract relevant information directly from the uploaded PDFs
- Maintain context from previous interactions
- Prioritize clarity and factual accuracy in your responses
- Give a very detailed answer with a detailed explanation

Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are taking an exam: the more accurate the answer you give, the more points you will get. Thus always try to give an answer without saying "I don't know".
Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material. Reply only in English.
Context: {context}
"""),
    HumanMessagePromptTemplate.from_template("{question}")
])

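# Streamlit reruns this script top to bottom on every interaction, so anything
# that must survive across reruns (transcript, chain, memory) lives in
# st.session_state.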
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'pdf_processed' not in st.session_state:
    st.session_state.pdf_processed = False
if 'qa_chain' not in st.session_state:
    st.session_state.qa_chain = None
if 'memory' not in st.session_state:
    st.session_state.memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key='answer'
    )


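# Query expansion: rewrite the user's question into a richer search query before
# retrieval, so the vector search has more key terms to match on. The expanded
# query (not the raw question) is what gets sent to the QA chain below.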
def processInput(question, client):
    prompt = f"""
    Given the user's question: {question}
    Expand and break down this question to include relevant context and key points that should be searched for.
    Return only the expanded question. The questions are related to a financial organization, Wells Fargo.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Follow the instructions and reply politely"},
            {"role": "user", "content": prompt}
        ],
        max_tokens=4000,
    )

    # Log the expanded query for debugging.
    print(completion.choices[0].message.content)
    return completion.choices[0].message.content


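# Ingestion pipeline: OCR each uploaded PDF, wrap pages as Documents, split them
# into chunks, embed the chunks into a persistent Chroma store, and wire the
# retriever into a ConversationalRetrievalChain.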
def process_pdfs(uploaded_files, openai_key):
    # Drop any previous vector store; ignore the error if "Data" doesn't exist yet.
    shutil.rmtree("Data", ignore_errors=True)

    all_documents = []

    for uploaded_file in uploaded_files:
        # The OCR step reads from a file path, so persist the upload to a temp file.
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(uploaded_file.read())
            temp_pdf_path = temp_file.name

        # process_pdf_with_ocr (local processPDF module) returns a mapping of
        # page number -> extracted text, as the indexing here implies.
        pdf_pages = process_pdf_with_ocr(temp_pdf_path, openai_key)
        for page_num, page_text in pdf_pages.items():
            all_documents.append(Document(
                page_content=page_text,
                metadata={'source': uploaded_file.name, 'page': page_num}
            ))

        os.unlink(temp_pdf_path)

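    # Split pages into ~1000-character chunks with 100-character overlap so each
    # embedded passage stays focused while context carries across boundaries.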
    text_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        separator="\n"
    )
    split_docs = text_splitter.split_documents(all_documents)

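    # Embed the chunks and persist the index to "Data", the same directory
    # cleared at the top of this function.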
    embeddings = OpenAIEmbeddings(openai_api_key=openai_key, model="text-embedding-3-large")
    vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data")

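    # Fetch the 10 most similar chunks per query; a larger k trades latency and
    # prompt size for recall.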
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})

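    # The chain condenses follow-up questions using the shared session memory,
    # retrieves chunks, and answers with chat_prompt (which supplies {context}).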
    llm = ChatOpenAI(
        temperature=0,
        openai_api_key=openai_key,
        model="gpt-4o-mini",
        max_tokens=500
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=st.session_state.memory,
        return_source_documents=True,
        verbose=True,
        combine_docs_chain_kwargs={'prompt': chat_prompt}
    )

    return qa_chain


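# Cap the rendered transcript at the three most recent exchanges.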
def manage_chat_history():
    if len(st.session_state.chat_history) > 3:
        st.session_state.chat_history = st.session_state.chat_history[-3:]


if 'openai_key' not in st.session_state:
    st.session_state.openai_key = None

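# Sidebar: upload PDFs, collect the API key, and build the QA chain once.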
with st.sidebar:
    st.header("Upload PDFs")
    uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)

    if st.button("Clear Chat History"):
        st.session_state.chat_history = []
        st.session_state.memory.clear()
        st.success("Chat history cleared!")

    if uploaded_files and not st.session_state.pdf_processed:
        if not st.session_state.openai_key:
            st.session_state.openai_key = st.text_input("Enter OpenAI API Key:", type="password")

        if st.session_state.openai_key:
            os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
            with st.spinner("Processing PDFs..."):
                try:
                    st.session_state.qa_chain = process_pdfs(uploaded_files, st.session_state.openai_key)
                    st.session_state.pdf_processed = True
                    st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!")
                except Exception as e:
                    st.error(f"Error processing PDFs: {str(e)}")
                    st.session_state.pdf_processed = False

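# Main chat area: replay the stored transcript, then handle new questions.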
if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
    for idx, (question, answer) in enumerate(st.session_state.chat_history, 1):
        with st.chat_message("user"):
            st.write(f"**Question {idx}:** {question}")
        with st.chat_message("assistant"):
            st.write(f"**Answer {idx}:** {answer}")

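    # For each new question: expand it, run the chain (conversation context comes
    # from the chain's memory, so chat_history is passed empty), then display and
    # store the exchange. OpenAI() picks up the key set in os.environ above.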
    if user_question := st.chat_input("Ask a question about the PDFs"):
        try:
            client = OpenAI()
            expanded_query = processInput(user_question, client)
            result = st.session_state.qa_chain({
                "question": expanded_query,
                "chat_history": []
            })
            answer = result['answer']

            st.session_state.chat_history.append((user_question, answer))
            manage_chat_history()

            with st.chat_message("user"):
                st.write(f"**Question:** {user_question}")
            with st.chat_message("assistant"):
                st.write(f"**Answer:** {answer}")

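            # Show which files and pages the answer drew from, for verification.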
            with st.expander("Source Documents"):
                for idx, doc in enumerate(result['source_documents'], 1):
                    st.write(f"**Source {idx}:**")
                    st.write(f"File: {doc.metadata.get('source', 'Unknown')}")
                    st.write(f"Page: {doc.metadata.get('page', 'N/A')}")
                    st.text(doc.page_content)

        except Exception as e:
            st.error(f"Error processing question: {str(e)}")
            st.warning("Please try asking a shorter question or breaking it into multiple parts.")

else:
    st.info("Please upload PDF files in the sidebar to start chatting.")