Upload 3 files

- app.py +37 -11
- processPDF.py +183 -0
- requirements.txt +4 -1
app.py
CHANGED
@@ -5,6 +5,7 @@ from langchain.text_splitter import CharacterTextSplitter
 from langchain.chat_models import ChatOpenAI
 from tempfile import NamedTemporaryFile
 import os
+from processPDF import process_pdf_with_ocr
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
@@ -16,6 +17,8 @@ from langchain.prompts import (
     HumanMessagePromptTemplate
 )
 from langchain.memory import ConversationBufferMemory
+from langchain.prompts import PromptTemplate
+from openai import OpenAI

 # Streamlit App Configuration
 st.set_page_config(page_title="Multi-PDF Chat", layout="wide")
@@ -28,6 +31,7 @@ You are an advanced PDF analysis AI assistant. Your key responsibilities are:
 - Extract relevant information directly from the uploaded PDFs
 - Maintain context from previous interactions
 - Prioritize clarity and factual accuracy in your responses
+- Give a very detailed answer with a detailed explanation

 Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are taking an exam: the more accurate the answer you give, the more points you will get.
 Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material.""")
@@ -70,6 +74,23 @@ if 'memory' not in st.session_state:
         return_messages=True,
         output_key='answer'
     )
+def processInput(question, client):
+    prompt = f"""
+    Given the user's question: {question}
+    Expand and break down this question to include relevant context and key points that should be searched for.
+    Return only the expanded question. The questions relate to a financial organization, Wells Fargo.
+    """
+    completion = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {"role": "system", "content": "Follow the instructions and reply politely"},
+            {"role": "user", "content": prompt}
+        ],
+        max_tokens=4000,
+    )
+
+    print(completion.choices[0].message.content)
+    return completion.choices[0].message.content

 # Function to process PDFs
 def process_pdfs(uploaded_files, openai_key):
@@ -88,9 +109,9 @@ def process_pdfs(uploaded_files, openai_key):
             temp_pdf_path = temp_file.name

         # Extract text from PDF with page tracking
-        pdf_reader = ...
-        for page_num ...
-        page_text = ...
+        pdf_reader = process_pdf_with_ocr(temp_pdf_path, openai_key)
+        for page_num in pdf_reader:
+            page_text = pdf_reader[page_num]
             # Create a document with page number metadata
             doc = Document(
                 page_content=page_text,
@@ -115,7 +136,7 @@ def process_pdfs(uploaded_files, openai_key):
     vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data")

     # Configure retriever with simpler settings
-    retriever = vector_store.as_retriever(search_kwargs={"k": ...})
+    retriever = vector_store.as_retriever(search_kwargs={"k": 10})

     # Set up QA chain with memory management
     llm = ChatOpenAI(
@@ -143,30 +164,33 @@ def manage_chat_history():
         st.session_state.chat_history = st.session_state.chat_history[-3:]

 # Sidebar for PDF upload
+if 'openai_key' not in st.session_state:
+    st.session_state.openai_key = None
+
 with st.sidebar:
     st.header("Upload PDFs")
     uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)

-    # Clear chat button
     if st.button("Clear Chat History"):
         st.session_state.chat_history = []
         st.session_state.memory.clear()
         st.success("Chat history cleared!")

-    # Process PDFs if newly uploaded
     if uploaded_files and not st.session_state.pdf_processed:
-        ...
-        ...
+        if not st.session_state.openai_key:
+            st.session_state.openai_key = st.text_input("Enter OpenAI API Key:", type="password")
+
+        if st.session_state.openai_key:
+            os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
             with st.spinner("Processing PDFs..."):
                 try:
-                    st.session_state.qa_chain = process_pdfs(uploaded_files, ...)
+                    st.session_state.qa_chain = process_pdfs(uploaded_files, st.session_state.openai_key)
                     st.session_state.pdf_processed = True
                     st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!")
                 except Exception as e:
                     st.error(f"Error processing PDFs: {str(e)}")
                     st.session_state.pdf_processed = False

-
 # Main chat interface
 if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
     # Display chat history
@@ -180,8 +204,10 @@ if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
     if user_question := st.chat_input("Ask a question about the PDFs"):
         try:
             # Run QA chain with error handling
+            client = OpenAI()
+            expanded_query = processInput(user_question, client)
             result = st.session_state.qa_chain({
-                "question": ...,
+                "question": expanded_query,
                 "chat_history": []  # Empty chat history to reduce tokens
             })
             answer = result['answer']
processPDF.py
ADDED
@@ -0,0 +1,183 @@
+import string
+import random
+import fitz
+from PIL import Image as Img
+import os
+import shutil
+import base64
+from openai import OpenAI
+
+import string
+import random
+import fitz
+from PIL import Image as Img
+import os
+import tqdm
+import shutil
+import base64
+from openai import OpenAI
+import streamlit as st
+
+def process_pdf_with_ocr(pdf_path, api_key):
+    def generate_random_string(length=10):
+        characters = string.ascii_letters + string.digits
+        return ''.join(random.choices(characters, k=length))
+
+    def encode_image(image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    def get_ocr_text(image_path, client, current_page, total_pages):
+        progress = (current_page / total_pages) * 100
+        status_text.text(f"Processing page {current_page}/{total_pages} with OCR")
+        progress_bar.progress(int(progress))
+
+        prompt = """
+        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
+        - Regular text is returned as plain text.
+        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
+        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
+        """
+
+        base64_image = encode_image(image_path)
+        response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[{
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+                ]
+            }]
+        )
+        return response.choices[0].message.content
+
+    # Initialize progress tracking
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    progress_info = st.empty()
+
+    # Initialize OpenAI client
+    status_text.text("Initializing OpenAI client...")
+    progress_bar.progress(5)
+    os.environ["OPENAI_API_KEY"] = api_key
+    client = OpenAI()
+
+    # Create temp folder for images
+    temp_folder = f"Images/{generate_random_string()}"
+    os.makedirs(temp_folder, exist_ok=True)
+    progress_bar.progress(10)
+
+    result = {}
+    try:
+        # Open PDF and get total pages
+        status_text.text("Opening PDF document...")
+        pdf_document = fitz.open(pdf_path)
+        total_pages = len(pdf_document)
+        progress_bar.progress(15)
+
+        # Convert PDF to images
+        for page_num in range(total_pages):
+            current_progress = 15 + (page_num / total_pages * 25)  # 15-40% progress for PDF to image conversion
+            status_text.text(f"Converting page {page_num + 1}/{total_pages} to image")
+            progress_info.text(f"PDF to Image conversion: {int(current_progress)}%")
+            progress_bar.progress(int(current_progress))
+
+            page = pdf_document[page_num]
+            pix = page.get_pixmap(dpi=150)
+            image_path = f"{temp_folder}/page_{page_num + 1}.png"
+            image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            image.save(image_path)
+
+        # Process OCR for each image
+        status_text.text("Starting OCR processing...")
+        progress_bar.progress(40)
+
+        for page_num in range(total_pages):
+            current_progress = 40 + (page_num / total_pages * 55)  # 40-95% progress for OCR
+            image_path = f"{temp_folder}/page_{page_num + 1}.png"
+            progress_info.text(f"OCR Processing: {int(current_progress)}%")
+
+            ocr_text = get_ocr_text(image_path, client, page_num + 1, total_pages)
+            result[page_num + 1] = ocr_text
+
+        pdf_document.close()
+        status_text.text("Finalizing...")
+        progress_bar.progress(95)
+
+    finally:
+        # Clean up
+        if os.path.exists(temp_folder):
+            status_text.text("Cleaning up temporary files...")
+            shutil.rmtree(temp_folder)
+        progress_bar.progress(100)
+        status_text.text("Processing complete!")
+        progress_info.empty()
+
+    return result
+
+'''
+def process_pdf_with_ocr(pdf_path, api_key):
+    def generate_random_string(length=10):
+        characters = string.ascii_letters + string.digits
+        return ''.join(random.choices(characters, k=length))
+
+    def encode_image(image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    def get_ocr_text(image_path, client):
+        prompt = """
+        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
+        - Regular text is returned as plain text.
+        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
+        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
+        """
+        base64_image = encode_image(image_path)
+        response = client.chat.completions.create(
+            model="gpt-4o",
+            messages=[{
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+                ]
+            }]
+        )
+        print(image_path)
+        print(response.choices[0].message.content)
+        return response.choices[0].message.content
+
+    # Initialize OpenAI client
+    os.environ["OPENAI_API_KEY"] = api_key
+    client = OpenAI()
+
+    # Create temp folder for images
+    temp_folder = f"Images/{generate_random_string()}"
+    os.makedirs(temp_folder, exist_ok=True)
+
+    # Process PDF
+    result = {}
+    try:
+        # Convert PDF to images
+        pdf_document = fitz.open(pdf_path)
+        for page_num in range(len(pdf_document)):
+            page = pdf_document[page_num]
+            pix = page.get_pixmap(dpi=150)
+            image_path = f"{temp_folder}/page_{page_num + 1}.png"
+            image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            image.save(image_path)
+
+            # Process each image with OCR
+            ocr_text = get_ocr_text(image_path, client)
+            result[page_num + 1] = ocr_text
+
+        pdf_document.close()
+
+    finally:
+        # Clean up temporary files
+        if os.path.exists(temp_folder):
+            shutil.rmtree(temp_folder)
+
+    return result
+'''
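
process_pdf_with_ocr returns a dict mapping 1-based page numbers to the OCR'd markdown for each page, and it drives st.progress/st.empty widgets directly, so it has to be called from inside a running Streamlit script. A minimal driver sketch follows; the file name ocr_demo.py and the upload flow are illustrative, not part of the commit.

# Minimal driver sketch for process_pdf_with_ocr (illustrative).
# Run with: streamlit run ocr_demo.py
import os
from tempfile import NamedTemporaryFile

import streamlit as st

from processPDF import process_pdf_with_ocr

api_key = st.text_input("OpenAI API key", type="password")
uploaded = st.file_uploader("PDF to OCR", type="pdf")

if api_key and uploaded:
    # Persist the upload to disk, since process_pdf_with_ocr expects a file path
    with NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded.read())
        pdf_path = tmp.name
    try:
        pages = process_pdf_with_ocr(pdf_path, api_key)  # {page_number: markdown text}
        for page_num in sorted(pages):
            st.subheader(f"Page {page_num}")
            st.markdown(pages[page_num])
    finally:
        os.remove(pdf_path)

Note that importing processPDF outside a Streamlit session works fine; it is the call itself that tries to render progress widgets.
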
requirements.txt
CHANGED
@@ -5,4 +5,7 @@ langchain==0.3.7
 langchain-openai==0.2.6
 langchain-chroma==0.1.4
 langchain-text-splitters==0.3.2
-chromadb==0.5.18
+chromadb==0.5.18
+pymupdf==1.24.13
+pillow==10.4.0
+openai==1.54.3
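
With these pins in place, the OCR path's new dependencies install alongside the existing LangChain stack via a plain `pip install -r requirements.txt`: pymupdf provides the fitz module used to rasterize pages, pillow supplies PIL.Image, and openai is the client SDK used for both query expansion and vision OCR.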