OmkarGhugarkar commited on
Commit
3905e66
·
verified ·
1 Parent(s): 2e39a83

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +37 -11
  2. processPDF.py +183 -0
  3. requirements.txt +4 -1
app.py CHANGED
@@ -5,6 +5,7 @@ from langchain.text_splitter import CharacterTextSplitter
5
  from langchain.chat_models import ChatOpenAI
6
  from tempfile import NamedTemporaryFile
7
  import os
 
8
  from langchain.embeddings import OpenAIEmbeddings
9
  from langchain.chains import ConversationalRetrievalChain
10
  from langchain.memory import ConversationBufferMemory
@@ -16,6 +17,8 @@ from langchain.prompts import (
16
  HumanMessagePromptTemplate
17
  )
18
  from langchain.memory import ConversationBufferMemory
 
 
19
 
20
  # Streamlit App Configuration
21
  st.set_page_config(page_title="Multi-PDF Chat", layout="wide")
@@ -28,6 +31,7 @@ You are an advanced PDF analysis AI assistant. Your key responsibilities are:
28
  - Extract relevant information directly from the uploaded PDFs
29
  - Maintain context from previous interactions
30
  - Prioritize clarity and factual accuracy in your responses
 
31
 
32
  Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are giving an exam and the more accurate the answer you give the more points you will get.
33
  Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material.""")
@@ -70,6 +74,23 @@ if 'memory' not in st.session_state:
70
  return_messages=True,
71
  output_key='answer'
72
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # Function to process PDFs
75
  def process_pdfs(uploaded_files, openai_key):
@@ -88,9 +109,9 @@ def process_pdfs(uploaded_files, openai_key):
88
  temp_pdf_path = temp_file.name
89
 
90
  # Extract text from PDF with page tracking
91
- pdf_reader = PdfReader(temp_pdf_path)
92
- for page_num, page in enumerate(pdf_reader.pages, 1):
93
- page_text = page.extract_text()
94
  # Create a document with page number metadata
95
  doc = Document(
96
  page_content=page_text,
@@ -115,7 +136,7 @@ def process_pdfs(uploaded_files, openai_key):
115
  vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data")
116
 
117
  # Configure retriever with simpler settings
118
- retriever = vector_store.as_retriever(search_kwargs={"k": 5})
119
 
120
  # Set up QA chain with memory management
121
  llm = ChatOpenAI(
@@ -143,30 +164,33 @@ def manage_chat_history():
143
  st.session_state.chat_history = st.session_state.chat_history[-3:]
144
 
145
  # Sidebar for PDF upload
 
 
 
146
  with st.sidebar:
147
  st.header("Upload PDFs")
148
  uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)
149
 
150
- # Clear chat button
151
  if st.button("Clear Chat History"):
152
  st.session_state.chat_history = []
153
  st.session_state.memory.clear()
154
  st.success("Chat history cleared!")
155
 
156
- # Process PDFs if newly uploaded
157
  if uploaded_files and not st.session_state.pdf_processed:
158
- key = st.text_input("Enter OpenAI API Key:", type="password")
159
- if key:
 
 
 
160
  with st.spinner("Processing PDFs..."):
161
  try:
162
- st.session_state.qa_chain = process_pdfs(uploaded_files, key)
163
  st.session_state.pdf_processed = True
164
  st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!")
165
  except Exception as e:
166
  st.error(f"Error processing PDFs: {str(e)}")
167
  st.session_state.pdf_processed = False
168
 
169
-
170
  # Main chat interface
171
  if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
172
  # Display chat history
@@ -180,8 +204,10 @@ if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
180
  if user_question := st.chat_input("Ask a question about the PDFs"):
181
  try:
182
  # Run QA chain with error handling
 
 
183
  result = st.session_state.qa_chain({
184
- "question": user_question,
185
  "chat_history": [] # Empty chat history to reduce tokens
186
  })
187
  answer = result['answer']
 
5
  from langchain.chat_models import ChatOpenAI
6
  from tempfile import NamedTemporaryFile
7
  import os
8
+ from processPDF import process_pdf_with_ocr
9
  from langchain.embeddings import OpenAIEmbeddings
10
  from langchain.chains import ConversationalRetrievalChain
11
  from langchain.memory import ConversationBufferMemory
 
17
  HumanMessagePromptTemplate
18
  )
19
  from langchain.memory import ConversationBufferMemory
20
+ from langchain.prompts import PromptTemplate
21
+ from openai import OpenAI
22
 
23
  # Streamlit App Configuration
24
  st.set_page_config(page_title="Multi-PDF Chat", layout="wide")
 
31
  - Extract relevant information directly from the uploaded PDFs
32
  - Maintain context from previous interactions
33
  - Prioritize clarity and factual accuracy in your responses
34
+ - Give a very detailed answer with a detailed explanation
35
 
36
  Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are giving an exam and the more accurate the answer you give the more points you will get.
37
  Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material.""")
 
74
  return_messages=True,
75
  output_key='answer'
76
  )
77
def processInput(question, client):
    """Expand a user question into a richer retrieval query.

    Asks an OpenAI chat model to break the question down and add the
    context and key points that should be searched for, so the expanded
    text gives the vector-store retriever better recall than the raw
    question alone.

    Args:
        question: Raw question text entered by the user.
        client: An OpenAI client instance (``openai.OpenAI``).

    Returns:
        The expanded question text produced by the model.
    """
    prompt = f"""
    Given the user's question: {question}
    Expand and break down this question to include relevant context and key points that should be searched for.
    Return only the expanded question. The questions are related to a financial organization, Wells Fargo.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Follow the instructions and reply politely"},
            # Pass the prompt string directly; the old "{}".format(prompt)
            # round-trip added nothing.
            {"role": "user", "content": prompt},
        ],
        max_tokens=4000,
    )
    # Debug print removed; the caller decides what to surface to the UI.
    return completion.choices[0].message.content
94
 
95
  # Function to process PDFs
96
  def process_pdfs(uploaded_files, openai_key):
 
109
  temp_pdf_path = temp_file.name
110
 
111
  # Extract text from PDF with page tracking
112
+ pdf_reader = process_pdf_with_ocr(temp_pdf_path,openai_key)
113
+ for page_num in pdf_reader:
114
+ page_text = pdf_reader[page_num]
115
  # Create a document with page number metadata
116
  doc = Document(
117
  page_content=page_text,
 
136
  vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data")
137
 
138
  # Configure retriever with simpler settings
139
+ retriever = vector_store.as_retriever(search_kwargs={"k": 10})
140
 
141
  # Set up QA chain with memory management
142
  llm = ChatOpenAI(
 
164
  st.session_state.chat_history = st.session_state.chat_history[-3:]
165
 
166
  # Sidebar for PDF upload
167
+ if 'openai_key' not in st.session_state:
168
+ st.session_state.openai_key = None
169
+
170
  with st.sidebar:
171
  st.header("Upload PDFs")
172
  uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)
173
 
 
174
  if st.button("Clear Chat History"):
175
  st.session_state.chat_history = []
176
  st.session_state.memory.clear()
177
  st.success("Chat history cleared!")
178
 
 
179
  if uploaded_files and not st.session_state.pdf_processed:
180
+ if not st.session_state.openai_key:
181
+ st.session_state.openai_key = st.text_input("Enter OpenAI API Key:", type="password")
182
+
183
+ if st.session_state.openai_key:
184
+ os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
185
  with st.spinner("Processing PDFs..."):
186
  try:
187
+ st.session_state.qa_chain = process_pdfs(uploaded_files, st.session_state.openai_key)
188
  st.session_state.pdf_processed = True
189
  st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!")
190
  except Exception as e:
191
  st.error(f"Error processing PDFs: {str(e)}")
192
  st.session_state.pdf_processed = False
193
 
 
194
  # Main chat interface
195
  if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
196
  # Display chat history
 
204
  if user_question := st.chat_input("Ask a question about the PDFs"):
205
  try:
206
  # Run QA chain with error handling
207
+ client = OpenAI()
208
+ expanded_query = processInput(user_question,client)
209
  result = st.session_state.qa_chain({
210
+ "question": expanded_query,
211
  "chat_history": [] # Empty chat history to reduce tokens
212
  })
213
  answer = result['answer']
processPDF.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+ import random
3
+ import fitz
4
+ from PIL import Image as Img
5
+ import os
6
+ import shutil
7
+ import base64
8
+ from openai import OpenAI
9
+
10
+ import string
11
+ import random
12
+ import fitz
13
+ from PIL import Image as Img
14
+ import os
15
+ import tqdm
16
+ import shutil
17
+ import base64
18
+ from openai import OpenAI
19
+ import streamlit as st
20
+
21
def process_pdf_with_ocr(pdf_path, api_key):
    """OCR every page of a PDF with GPT-4o vision and return the text per page.

    Each page is rendered to a PNG at 150 dpi with PyMuPDF, sent to the
    OpenAI vision model for OCR (plain text plus markdown tables), and the
    result collected per page. Streamlit widgets report progress: 15-40%
    for PDF-to-image conversion, 40-95% for OCR.

    Args:
        pdf_path: Filesystem path to the PDF to process.
        api_key: OpenAI API key; exported via the OPENAI_API_KEY env var.

    Returns:
        dict mapping 1-based page number -> OCR'd text in markdown.
    """
    def generate_random_string(length=10):
        # Random folder suffix so concurrent runs don't collide on disk.
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))

    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_ocr_text(image_path, client, current_page, total_pages):
        # Keep the bar inside the 40-95% OCR band; the old 0-100 scale made
        # the bar jump backwards against the phase-based progress below.
        progress = 40 + (current_page / total_pages) * 55
        status_text.text(f"Processing page {current_page}/{total_pages} with OCR")
        progress_bar.progress(int(progress))

        prompt = """
        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
        - Regular text is returned as plain text.
        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
        """

        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    # Pages are saved as PNG, so advertise the matching MIME
                    # type (was image/jpeg with PNG bytes).
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
                ]
            }]
        )
        return response.choices[0].message.content

    # Initialize progress tracking widgets (closed over by get_ocr_text).
    progress_bar = st.progress(0)
    status_text = st.empty()
    progress_info = st.empty()

    # Initialize OpenAI client.
    status_text.text("Initializing OpenAI client...")
    progress_bar.progress(5)
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()

    # Create temp folder for the rendered page images.
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    progress_bar.progress(10)

    result = {}
    pdf_document = None
    try:
        status_text.text("Opening PDF document...")
        pdf_document = fitz.open(pdf_path)
        total_pages = len(pdf_document)
        progress_bar.progress(15)

        # Phase 1: render every page to a PNG (15-40% of the bar).
        for page_num in range(total_pages):
            current_progress = 15 + (page_num / total_pages * 25)
            status_text.text(f"Converting page {page_num + 1}/{total_pages} to image")
            progress_info.text(f"PDF to Image conversion: {int(current_progress)}%")
            progress_bar.progress(int(current_progress))

            page = pdf_document[page_num]
            pix = page.get_pixmap(dpi=150)
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image.save(image_path)

        # Phase 2: OCR each rendered page (40-95% of the bar).
        status_text.text("Starting OCR processing...")
        progress_bar.progress(40)

        for page_num in range(total_pages):
            current_progress = 40 + (page_num / total_pages * 55)
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            progress_info.text(f"OCR Processing: {int(current_progress)}%")
            result[page_num + 1] = get_ocr_text(image_path, client, page_num + 1, total_pages)

        status_text.text("Finalizing...")
        progress_bar.progress(95)

    finally:
        # Always release the document handle and temp images, even if
        # rendering or an OCR call raised (the old code only closed the
        # document on the success path).
        if pdf_document is not None:
            pdf_document.close()
        if os.path.exists(temp_folder):
            status_text.text("Cleaning up temporary files...")
            shutil.rmtree(temp_folder)
        progress_bar.progress(100)
        status_text.text("Processing complete!")
        progress_info.empty()

    return result
118
+
119
# NOTE(review): a second, older copy of process_pdf_with_ocr (same OCR logic
# but without the Streamlit progress reporting) was parked here inside a
# triple-quoted string. Commented-out code removed; recover the progress-free
# variant from version control history if it is ever needed again.
requirements.txt CHANGED
@@ -5,4 +5,7 @@ langchain==0.3.7
5
  langchain-openai==0.2.6
6
  langchain-chroma==0.1.4
7
  langchain-text-splitters==0.3.2
8
- chromadb==0.5.18
 
 
 
 
5
  langchain-openai==0.2.6
6
  langchain-chroma==0.1.4
7
  langchain-text-splitters==0.3.2
8
+ chromadb==0.5.18
9
+ pymupdf==1.24.13
10
+ pillow==10.4.0
11
+ openai==1.54.3