OmkarGhugarkar commited on
Commit
3905e66
·
verified ·
1 Parent(s): 2e39a83

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +37 -11
  2. processPDF.py +183 -0
  3. requirements.txt +4 -1
app.py CHANGED
@@ -5,6 +5,7 @@ from langchain.text_splitter import CharacterTextSplitter
5
  from langchain.chat_models import ChatOpenAI
6
  from tempfile import NamedTemporaryFile
7
  import os
 
8
  from langchain.embeddings import OpenAIEmbeddings
9
  from langchain.chains import ConversationalRetrievalChain
10
  from langchain.memory import ConversationBufferMemory
@@ -16,6 +17,8 @@ from langchain.prompts import (
16
  HumanMessagePromptTemplate
17
  )
18
  from langchain.memory import ConversationBufferMemory
 
 
19
 
20
  # Streamlit App Configuration
21
  st.set_page_config(page_title="Multi-PDF Chat", layout="wide")
@@ -28,6 +31,7 @@ You are an advanced PDF analysis AI assistant. Your key responsibilities are:
28
  - Extract relevant information directly from the uploaded PDFs
29
  - Maintain context from previous interactions
30
  - Prioritize clarity and factual accuracy in your responses
 
31
 
32
  Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are giving an exam and the more accurate the answer you give the more points you will get.
33
  Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material.""")
@@ -70,6 +74,23 @@ if 'memory' not in st.session_state:
70
  return_messages=True,
71
  output_key='answer'
72
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # Function to process PDFs
75
  def process_pdfs(uploaded_files, openai_key):
@@ -88,9 +109,9 @@ def process_pdfs(uploaded_files, openai_key):
88
  temp_pdf_path = temp_file.name
89
 
90
  # Extract text from PDF with page tracking
91
- pdf_reader = PdfReader(temp_pdf_path)
92
- for page_num, page in enumerate(pdf_reader.pages, 1):
93
- page_text = page.extract_text()
94
  # Create a document with page number metadata
95
  doc = Document(
96
  page_content=page_text,
@@ -115,7 +136,7 @@ def process_pdfs(uploaded_files, openai_key):
115
  vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data")
116
 
117
  # Configure retriever with simpler settings
118
- retriever = vector_store.as_retriever(search_kwargs={"k": 5})
119
 
120
  # Set up QA chain with memory management
121
  llm = ChatOpenAI(
@@ -143,30 +164,33 @@ def manage_chat_history():
143
  st.session_state.chat_history = st.session_state.chat_history[-3:]
144
 
145
  # Sidebar for PDF upload
 
 
 
146
  with st.sidebar:
147
  st.header("Upload PDFs")
148
  uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)
149
 
150
- # Clear chat button
151
  if st.button("Clear Chat History"):
152
  st.session_state.chat_history = []
153
  st.session_state.memory.clear()
154
  st.success("Chat history cleared!")
155
 
156
- # Process PDFs if newly uploaded
157
  if uploaded_files and not st.session_state.pdf_processed:
158
- key = st.text_input("Enter OpenAI API Key:", type="password")
159
- if key:
 
 
 
160
  with st.spinner("Processing PDFs..."):
161
  try:
162
- st.session_state.qa_chain = process_pdfs(uploaded_files, key)
163
  st.session_state.pdf_processed = True
164
  st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!")
165
  except Exception as e:
166
  st.error(f"Error processing PDFs: {str(e)}")
167
  st.session_state.pdf_processed = False
168
 
169
-
170
  # Main chat interface
171
  if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
172
  # Display chat history
@@ -180,8 +204,10 @@ if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
180
  if user_question := st.chat_input("Ask a question about the PDFs"):
181
  try:
182
  # Run QA chain with error handling
 
 
183
  result = st.session_state.qa_chain({
184
- "question": user_question,
185
  "chat_history": [] # Empty chat history to reduce tokens
186
  })
187
  answer = result['answer']
 
5
  from langchain.chat_models import ChatOpenAI
6
  from tempfile import NamedTemporaryFile
7
  import os
8
+ from processPDF import process_pdf_with_ocr
9
  from langchain.embeddings import OpenAIEmbeddings
10
  from langchain.chains import ConversationalRetrievalChain
11
  from langchain.memory import ConversationBufferMemory
 
17
  HumanMessagePromptTemplate
18
  )
19
  from langchain.memory import ConversationBufferMemory
20
+ from langchain.prompts import PromptTemplate
21
+ from openai import OpenAI
22
 
23
  # Streamlit App Configuration
24
  st.set_page_config(page_title="Multi-PDF Chat", layout="wide")
 
31
  - Extract relevant information directly from the uploaded PDFs
32
  - Maintain context from previous interactions
33
  - Prioritize clarity and factual accuracy in your responses
34
+ - Give a very detailed answer with a detailed explanation
35
 
36
  Think step by step and answer the question. Your life depends on it. Be very careful and precise in answering the question. Assume you are giving an exam and the more accurate the answer you give the more points you will get.
37
  Use the provided context and chat history to formulate a comprehensive answer. Always ground your response in the source material.""")
 
74
  return_messages=True,
75
  output_key='answer'
76
  )
77
def processInput(question, client):
    """Expand a user question into a richer retrieval query.

    Asks an OpenAI chat model to break the question down and add the
    context and key points that should be searched for, so the expanded
    text gives the vector-store retriever better recall than the raw
    question alone.

    Args:
        question: Raw question text entered by the user.
        client: An OpenAI client instance (``openai.OpenAI``).

    Returns:
        The expanded question text produced by the model.
    """
    prompt = f"""
    Given the user's question: {question}
    Expand and break down this question to include relevant context and key points that should be searched for.
    Return only the expanded question. The questions are related to a financial organization, Wells Fargo.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Follow the instructions and reply politely"},
            # Pass the prompt string directly; the old "{}".format(prompt)
            # round-trip added nothing.
            {"role": "user", "content": prompt},
        ],
        max_tokens=4000,
    )
    # Debug print removed; the caller decides what to surface to the UI.
    return completion.choices[0].message.content
94
 
95
  # Function to process PDFs
96
  def process_pdfs(uploaded_files, openai_key):
 
109
  temp_pdf_path = temp_file.name
110
 
111
  # Extract text from PDF with page tracking
112
+ pdf_reader = process_pdf_with_ocr(temp_pdf_path,openai_key)
113
+ for page_num in pdf_reader:
114
+ page_text = pdf_reader[page_num]
115
  # Create a document with page number metadata
116
  doc = Document(
117
  page_content=page_text,
 
136
  vector_store = Chroma.from_documents(split_docs, embedding=embeddings, persist_directory="Data")
137
 
138
  # Configure retriever with simpler settings
139
+ retriever = vector_store.as_retriever(search_kwargs={"k": 10})
140
 
141
  # Set up QA chain with memory management
142
  llm = ChatOpenAI(
 
164
  st.session_state.chat_history = st.session_state.chat_history[-3:]
165
 
166
  # Sidebar for PDF upload
167
+ if 'openai_key' not in st.session_state:
168
+ st.session_state.openai_key = None
169
+
170
  with st.sidebar:
171
  st.header("Upload PDFs")
172
  uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)
173
 
 
174
  if st.button("Clear Chat History"):
175
  st.session_state.chat_history = []
176
  st.session_state.memory.clear()
177
  st.success("Chat history cleared!")
178
 
 
179
  if uploaded_files and not st.session_state.pdf_processed:
180
+ if not st.session_state.openai_key:
181
+ st.session_state.openai_key = st.text_input("Enter OpenAI API Key:", type="password")
182
+
183
+ if st.session_state.openai_key:
184
+ os.environ["OPENAI_API_KEY"] = st.session_state.openai_key
185
  with st.spinner("Processing PDFs..."):
186
  try:
187
+ st.session_state.qa_chain = process_pdfs(uploaded_files, st.session_state.openai_key)
188
  st.session_state.pdf_processed = True
189
  st.success(f"Processed {len(uploaded_files)} PDF(s) successfully!")
190
  except Exception as e:
191
  st.error(f"Error processing PDFs: {str(e)}")
192
  st.session_state.pdf_processed = False
193
 
 
194
  # Main chat interface
195
  if st.session_state.pdf_processed and st.session_state.qa_chain is not None:
196
  # Display chat history
 
204
  if user_question := st.chat_input("Ask a question about the PDFs"):
205
  try:
206
  # Run QA chain with error handling
207
+ client = OpenAI()
208
+ expanded_query = processInput(user_question,client)
209
  result = st.session_state.qa_chain({
210
+ "question": expanded_query,
211
  "chat_history": [] # Empty chat history to reduce tokens
212
  })
213
  answer = result['answer']
processPDF.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+ import random
3
+ import fitz
4
+ from PIL import Image as Img
5
+ import os
6
+ import shutil
7
+ import base64
8
+ from openai import OpenAI
9
+
10
+ import string
11
+ import random
12
+ import fitz
13
+ from PIL import Image as Img
14
+ import os
15
+ import tqdm
16
+ import shutil
17
+ import base64
18
+ from openai import OpenAI
19
+ import streamlit as st
20
+
21
def process_pdf_with_ocr(pdf_path, api_key):
    """OCR every page of a PDF with GPT-4o vision and return the text per page.

    Each page is rendered to a PNG at 150 dpi with PyMuPDF, sent to the
    OpenAI vision model for OCR (plain text plus markdown tables), and the
    result collected per page. Streamlit widgets report progress: 15-40%
    for PDF-to-image conversion, 40-95% for OCR.

    Args:
        pdf_path: Filesystem path to the PDF to process.
        api_key: OpenAI API key; exported via the OPENAI_API_KEY env var.

    Returns:
        dict mapping 1-based page number -> OCR'd text in markdown.
    """
    def generate_random_string(length=10):
        # Random folder suffix so concurrent runs don't collide on disk.
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))

    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_ocr_text(image_path, client, current_page, total_pages):
        # Keep the bar inside the 40-95% OCR band; the old 0-100 scale made
        # the bar jump backwards against the phase-based progress below.
        progress = 40 + (current_page / total_pages) * 55
        status_text.text(f"Processing page {current_page}/{total_pages} with OCR")
        progress_bar.progress(int(progress))

        prompt = """
        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
        - Regular text is returned as plain text.
        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
        """

        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    # Pages are saved as PNG, so advertise the matching MIME
                    # type (was image/jpeg with PNG bytes).
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
                ]
            }]
        )
        return response.choices[0].message.content

    # Initialize progress tracking widgets (closed over by get_ocr_text).
    progress_bar = st.progress(0)
    status_text = st.empty()
    progress_info = st.empty()

    # Initialize OpenAI client.
    status_text.text("Initializing OpenAI client...")
    progress_bar.progress(5)
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()

    # Create temp folder for the rendered page images.
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    progress_bar.progress(10)

    result = {}
    pdf_document = None
    try:
        status_text.text("Opening PDF document...")
        pdf_document = fitz.open(pdf_path)
        total_pages = len(pdf_document)
        progress_bar.progress(15)

        # Phase 1: render every page to a PNG (15-40% of the bar).
        for page_num in range(total_pages):
            current_progress = 15 + (page_num / total_pages * 25)
            status_text.text(f"Converting page {page_num + 1}/{total_pages} to image")
            progress_info.text(f"PDF to Image conversion: {int(current_progress)}%")
            progress_bar.progress(int(current_progress))

            page = pdf_document[page_num]
            pix = page.get_pixmap(dpi=150)
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image.save(image_path)

        # Phase 2: OCR each rendered page (40-95% of the bar).
        status_text.text("Starting OCR processing...")
        progress_bar.progress(40)

        for page_num in range(total_pages):
            current_progress = 40 + (page_num / total_pages * 55)
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            progress_info.text(f"OCR Processing: {int(current_progress)}%")
            result[page_num + 1] = get_ocr_text(image_path, client, page_num + 1, total_pages)

        status_text.text("Finalizing...")
        progress_bar.progress(95)

    finally:
        # Always release the document handle and temp images, even if
        # rendering or an OCR call raised (the old code only closed the
        # document on the success path).
        if pdf_document is not None:
            pdf_document.close()
        if os.path.exists(temp_folder):
            status_text.text("Cleaning up temporary files...")
            shutil.rmtree(temp_folder)
        progress_bar.progress(100)
        status_text.text("Processing complete!")
        progress_info.empty()

    return result
118
+
119
# NOTE(review): a second, older copy of process_pdf_with_ocr (same OCR logic
# but without the Streamlit progress reporting) was parked here inside a
# triple-quoted string. Commented-out code removed; recover the progress-free
# variant from version control history if it is ever needed again.
requirements.txt CHANGED
@@ -5,4 +5,7 @@ langchain==0.3.7
5
  langchain-openai==0.2.6
6
  langchain-chroma==0.1.4
7
  langchain-text-splitters==0.3.2
8
- chromadb==0.5.18
 
 
 
 
5
  langchain-openai==0.2.6
6
  langchain-chroma==0.1.4
7
  langchain-text-splitters==0.3.2
8
+ chromadb==0.5.18
9
+ pymupdf==1.24.13
10
+ pillow==10.4.0
11
+ openai==1.54.3