import os
import shutil
from tempfile import NamedTemporaryFile

import streamlit as st
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.docstore.document import Document
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)
from openai import OpenAI

from processPDF import process_pdf_with_ocr

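# Launch (assuming this script is saved as app.py):
#   streamlit run app.py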
st.set_page_config(page_title="Multi-PDF Chat", layout="wide")
st.title("📄 Multi-PDF Question-Answering Chatbot")

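# Prompt for the chain's answer-generation ("combine docs") step. The {context}
# placeholder receives the retrieved chunks and {question} the user's query;
# both must appear in the template for ConversationalRetrievalChain to fill them in.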
chat_prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template("""
You are an advanced PDF analysis AI assistant. Your key responsibilities are:
- Provide precise and accurate answers based on the document contents
- Extract relevant information directly from the uploaded PDFs
- Maintain context from previous interactions
- Prioritize clarity and factual accuracy in your responses
- Give a very detailed answer with a detailed explanation

Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are taking an exam: the more accurate the answer you give, the more points you will get. Thus always try to give an answer without saying "I don't know".
Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material. Reply only in English.
Context: {context}
"""),
    HumanMessagePromptTemplate.from_template("{question}")
])

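# Streamlit reruns this script top to bottom on every interaction, so anything
# that must survive across reruns (transcript, chain, memory) lives in
# st.session_state.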
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'pdf_processed' not in st.session_state:
    st.session_state.pdf_processed = False
if 'qa_chain' not in st.session_state:
    st.session_state.qa_chain = None
if 'memory' not in st.session_state:
    st.session_state.memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key='answer'
    )


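# Query expansion: rewrite the user's question into a richer search query before
# retrieval, so the vector search has more key terms to match on. The expanded
# query (not the raw question) is what gets sent to the QA chain below.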
def processInput(question, client):
    prompt = f"""
    Given the user's question: {question}
    Expand and break down this question to include relevant context and key points that should be searched for.
    Return only the expanded question. The questions are related to a financial organization, Wells Fargo.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Follow the instructions and reply politely"},
            {"role": "user", "content": prompt}
        ],
        max_tokens=4000,
    )

    # Log the expanded query for debugging.
    print(completion.choices[0].message.content)
    return completion.choices[0].message.content


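# Ingestion pipeline: OCR each uploaded PDF, wrap pages as Documents, split them
# into chunks, embed the chunks into a persistent Chroma store, and wire the
# retriever into a ConversationalRetrievalChain.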
def process_pdfs(uploaded_files, openai_key):
    # Drop any previous vector store; ignore the error if "Data" doesn't exist yet.
    shutil.rmtree("Data", ignore_errors=True)

    all_documents = []

    for uploaded_file in uploaded_files:
        # The OCR step reads from a file path, so persist the upload to a temp file.
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(uploaded_file.read())
            temp_pdf_path = temp_file.name

        # process_pdf_with_ocr (local processPDF module) returns a mapping of
        # page number -> extracted text, as the indexing here implies.
        pdf_pages = process_pdf_with_ocr(temp_pdf_path, openai_key)
        for page_num, page_text in pdf_pages.items():
            all_documents.append(Document(
                page_content=page_text,
                metadata={'source': uploaded_file.name, 'page': page_num}
            ))

        os.unlink(temp_pdf_path)

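    # Split pages into ~1000-character chunks with 100-character overlap so each
    # embedded passage stays focused while context carries across boundaries.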
    text_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        separator="\n"
    )
    split_docs = text_splitter.split_documents(all_documents)

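    # Embed the chunks and persist the index to "Data", the same directory
    # cleared at the top of this function.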
    embeddings = OpenAIEmbeddings(openai_api_key=openai_key, model="text-embedding-3-large")
    vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data")

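    # Fetch the 10 most similar chunks per query; a larger k trades latency and
    # prompt size for recall.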
    retriever = vector_store.as_retriever(search_kwargs={"k": 10})

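    # The chain condenses follow-up questions using the shared session memory,
    # retrieves chunks, and answers with chat_prompt (which supplies {context}).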
    llm = ChatOpenAI(
        temperature=0,
        openai_api_key=openai_key,
        model="gpt-4o-mini",
        max_tokens=500
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=st.session_state.memory,
        return_source_documents=True,
        verbose=True,
        combine_docs_chain_kwargs={'prompt': chat_prompt}
    )

    return qa_chain


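# Cap the rendered transcript at the three most recent exchanges.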
def manage_chat_history():
    if len(st.session_state.chat_history) > 3:
        st.session_state.chat_history = st.session_state.chat_history[-3:]


if 'openai_key' not in st.session_state:
    st.session_state.openai_key = None

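# Sidebar: upload PDFs, collect the API key, and build the QA chain once.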
with st.sidebar:
    st.header("Upload PDFs")
    uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)

    if st.button("Clear Chat History"):
        st.session_state.chat_history = []
        st.session_state.memory.clear()
        st.success("Chat history cleared!")

    if uploaded_files and not st.session_state.pdf_processed:
        if not st.session_state.openai_key:
            st.session_state.openai_key = st.text_input("Enter OpenAI API Key:", type="password")

        if st.session_state.openai_key:
            os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
            with st.spinner("Processing PDFs..."):
                try:
                    st.session_state.qa_chain = process_pdfs(uploaded_files, st.session_state.openai_key)
                    st.session_state.pdf_processed = True
                    st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!")
                except Exception as e:
                    st.error(f"Error processing PDFs: {str(e)}")
                    st.session_state.pdf_processed = False

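# Main chat area: replay the stored transcript, then handle new questions.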
if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
    for idx, (question, answer) in enumerate(st.session_state.chat_history, 1):
        with st.chat_message("user"):
            st.write(f"**Question {idx}:** {question}")
        with st.chat_message("assistant"):
            st.write(f"**Answer {idx}:** {answer}")

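    # For each new question: expand it, run the chain (conversation context comes
    # from the chain's memory, so chat_history is passed empty), then display and
    # store the exchange. OpenAI() picks up the key set in os.environ above.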
    if user_question := st.chat_input("Ask a question about the PDFs"):
        try:
            client = OpenAI()
            expanded_query = processInput(user_question, client)
            result = st.session_state.qa_chain({
                "question": expanded_query,
                "chat_history": []
            })
            answer = result['answer']

            st.session_state.chat_history.append((user_question, answer))
            manage_chat_history()

            with st.chat_message("user"):
                st.write(f"**Question:** {user_question}")
            with st.chat_message("assistant"):
                st.write(f"**Answer:** {answer}")

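            # Show which files and pages the answer drew from, for verification.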
            with st.expander("Source Documents"):
                for idx, doc in enumerate(result['source_documents'], 1):
                    st.write(f"**Source {idx}:**")
                    st.write(f"File: {doc.metadata.get('source', 'Unknown')}")
                    st.write(f"Page: {doc.metadata.get('page', 'N/A')}")
                    st.text(doc.page_content)

        except Exception as e:
            st.error(f"Error processing question: {str(e)}")
            st.warning("Please try asking a shorter question or breaking it into multiple parts.")

else:
    st.info("Please upload PDF files in the sidebar to start chatting.")