Spaces:

aaporosh
/

SmartPDF_Q_A

Running

App Files Files Community

aaporosh committed 2 days ago

Commit

11694c7

verified ·

1 Parent(s): fecb449

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -82

app.py CHANGED Viewed

@@ -1,93 +1,157 @@
 import streamlit as st
-import pdfplumber
-import re
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import FAISS
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.chat_models import ChatOpenAI
-from langchain.chains import ConversationalRetrievalChain
 from transformers import pipeline
-# -------------------- PAGE CONFIG --------------------
-st.set_page_config(page_title="Smart PDF Chatbot", layout="wide")
-# -------------------- MODELS --------------------
-@st.cache_resource
-def load_models():
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-    return embeddings, summarizer
-embeddings, summarizer = load_models()
-# -------------------- TITLE --------------------
-st.title("📄 Smart PDF Chatbot & Summarizer")
-# -------------------- UPLOAD PDF --------------------
-uploaded_file = st.file_uploader("📤 Upload your PDF file", type=["pdf"])
-if uploaded_file:
-    # Extract text from PDF
-    with pdfplumber.open(uploaded_file) as pdf:
-        text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
-    if not text.strip():
-        st.error("⚠️ Could not extract text from this PDF.")
-    else:
-        # Split into chunks for better retrieval
-        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        chunks = splitter.split_text(text)
-        # Build vector store for retrieval
-        vector_store = FAISS.from_texts(chunks, embedding=embeddings)
-        retriever = vector_store.as_retriever()
-        # Create conversational chain with memory
-        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-        qa_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
-        # Tabs for Chat, Summary, and Code
-        tabs = st.tabs(["💬 Chat with PDF", "📝 Summarize PDF", "💻 Extract Code"])
-        # -------------------- CHAT TAB --------------------
-        with tabs[0]:
-            st.subheader("Ask Questions About Your PDF")
-            if "chat_history" not in st.session_state:
-                st.session_state.chat_history = []
-            user_input = st.text_input("Enter your question:", key="chat_input")
-            if st.button("Send"):
-                result = qa_chain({"question": user_input, "chat_history": st.session_state.chat_history})
-                st.session_state.chat_history.append((user_input, result["answer"]))
-            for q, a in st.session_state.chat_history:
-                st.markdown(f"**You:** {q}")
-                st.markdown(f"**Bot:** {a}")
-        # -------------------- SUMMARY TAB --------------------
-        with tabs[1]:
-            st.subheader("📘 PDF Summary")
-            if st.button("Generate Summary", key="sum"):
                 try:
-                    # Summarize in chunks for long PDFs
-                    summaries = []
-                    for i in range(0, len(chunks), 3):
-                        chunk_text = " ".join(chunks[i:i+3])
-                        summary = summarizer(chunk_text, max_length=150, min_length=30, do_sample=False)
-                        summaries.append(summary[0]['summary_text'])
-                    final_summary = " ".join(summaries)
-                    st.info(final_summary)
-                except Exception as e:
-                    st.error(f"Summarization error: {e}")
-        # -------------------- CODE EXTRACTION TAB --------------------
-        with tabs[2]:
-            st.subheader("🧑‍💻 Extracted Code Blocks")
-            code_blocks = re.findall(r"```[a-zA-Z]*([\s\S]*?)```", text)
-            if code_blocks:
-                for idx, code in enumerate(code_blocks, 1):
-                    st.code(code, language="python")
-            else:
-                st.warning("No code blocks found in this PDF.")
-else:
-    st.info("👆 Please upload a PDF to get started.")

+# ------------- app.py -------------
 import streamlit as st
+from pathlib import Path
+from io import BytesIO
+import pdfplumber, pytesseract, time, re, logging, os
+from PIL import Image
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+from sentence_transformers import SentenceTransformer
 from transformers import pipeline
+import numpy as np
+# Configure the root logger once at import time so INFO-and-above records are emitted.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+###############################################################################
+# Page layout
+###############################################################################
+# Custom CSS: tighter page padding, tab spacing, and the chat-bubble classes
+# (.chat-msg / .user / .assistant) used by the chat tab's HTML.
+_PAGE_CSS = """
+<style>
+    .block-container { padding-top: 1rem; padding-bottom: 0; }
+    .stTabs [data-baseweb="tab-list"] { gap: 4px; }
+    .stTabs [data-baseweb="tab"] { padding: 8px 24px; }
+    .chat-msg { padding: 0.5rem 1rem; border-radius: 8px; margin: 0.3rem 0; }
+    .user   { background-color: #e3f2fd; margin-left: 20%; }
+    .assistant { background-color: #f1f3f4; margin-right: 20%; }
+</style>
+"""
+# Page title and wide layout are set before any other Streamlit output.
+st.set_page_config(page_title="PDF Chat & Summarize", layout="wide")
+st.markdown(_PAGE_CSS, unsafe_allow_html=True)
+###############################################################################
+# Cached heavy objects
+###############################################################################
+@st.cache_resource(show_spinner=False)
+def load_embed():
+    """Create the "all-MiniLM-L6-v2" sentence-embedding model.
+
+    Decorated with ``st.cache_resource`` (spinner hidden) so Streamlit can
+    reuse the instance instead of constructing it again.
+    """
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    return model
+@st.cache_resource(show_spinner=False)
+def load_qa():
+    """Create the text2text-generation pipeline used for question answering.
+
+    Uses the "google/flan-t5-large" model with a default ``max_length`` of 512.
+    Decorated with ``st.cache_resource`` (spinner hidden) so Streamlit can
+    reuse the instance instead of constructing it again.
+    """
+    qa_model = pipeline(
+        "text2text-generation",
+        model="google/flan-t5-large",
+        max_length=512,
+    )
+    return qa_model
+@st.cache_resource(show_spinner=False)
+def load_sum():
+    """Create the summarization pipeline.
+
+    Uses the "facebook/bart-large-cnn" model with a default ``max_length`` of 250.
+    Decorated with ``st.cache_resource`` (spinner hidden) so Streamlit can
+    reuse the instance instead of constructing it again.
+    """
+    summarizer = pipeline(
+        "summarization",
+        model="facebook/bart-large-cnn",
+        max_length=250,
+    )
+    return summarizer
+# Instantiate the three models up front (embedding, QA, summarization), in that order.
+embed, qa_pipe, sum_pipe = load_embed(), load_qa(), load_sum()
+###############################################################################
+# Helpers
+###############################################################################
+def extract_pdf(uploaded_file):
+    """Extract the text and the embedded images from an uploaded PDF.
+
+    Each page's text layer is read with pdfplumber (layout-preserving mode
+    first, plain mode as a fallback).  A page with no usable text layer is
+    rendered with ``resolution=200`` and passed to Tesseract OCR instead.
+    Embedded images are obtained by cropping the page to each image's
+    bounding box and rendering that crop.
+
+    Args:
+        uploaded_file: Object exposing ``getbuffer()`` that yields the PDF
+            bytes (the sidebar passes the ``st.file_uploader`` result).
+
+    Returns:
+        ``(text, images)``: the page texts joined with newlines and stripped,
+        and a list with one rendered image per embedded image that could be
+        cropped successfully.
+    """
+    page_texts = []
+    images = []
+    with pdfplumber.open(BytesIO(uploaded_file.getbuffer())) as pdf:
+        for page in pdf.pages:
+            # pdfplumber has no ``extract_text_layout`` method; layout mode is
+            # the ``layout=True`` flag of ``extract_text``.
+            txt = page.extract_text(layout=True) or page.extract_text() or ""
+            if not txt.strip():
+                # No usable text layer (e.g. a scanned page): fall back to OCR.
+                page_img = page.to_image(resolution=200).original
+                txt = pytesseract.image_to_string(page_img)
+            page_texts.append(txt)
+            for img in page.images:
+                # Image extraction is best-effort: one image that cannot be
+                # rendered must not abort the whole document, but is logged.
                 try:
+                    # Crop boxes are (x0, top, x1, bottom), measured from the
+                    # top of the page; ``y0``/``y1`` are measured from the
+                    # bottom and would select the wrong region.
+                    bbox = (img["x0"], img["top"], img["x1"], img["bottom"])
+                    pil = page.within_bbox(bbox).to_image(resolution=200).original
+                except Exception:
+                    logger.warning("Skipping an embedded image that could not be rendered", exc_info=True)
+                else:
+                    images.append(pil)
+    return "\n".join(page_texts).strip(), images
+def build_index(text):
+    """Split ``text`` into chunks, embed them, and build a FAISS index.
+
+    Args:
+        text: Plain text of the document.
+
+    Returns:
+        A LangChain ``FAISS`` vector store over the chunks, or ``None`` when
+        ``text`` produces no chunks (there is nothing to index).
+    """
+    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=80)
+    chunks = splitter.split_text(text)
+    if not chunks:
+        # FAISS cannot be built from zero vectors; signal "no index" instead.
+        return None
+    # Local import: only this function needs the LangChain base class.
+    from langchain_core.embeddings import Embeddings
+    class _SentenceTransformerEmbeddings(Embeddings):
+        """Expose the module-level ``embed`` model through LangChain's interface.
+
+        The vector store embeds each query itself.  Given the raw model it
+        would call the model object directly instead of ``encode``.
+        """
+        def embed_documents(self, texts):
+            return embed.encode(list(texts), show_progress_bar=False, batch_size=64).tolist()
+        def embed_query(self, text):
+            return embed.encode(text, show_progress_bar=False).tolist()
+    vectors = embed.encode(chunks, show_progress_bar=False, batch_size=64)
+    return FAISS.from_embeddings(list(zip(chunks, vectors)), _SentenceTransformerEmbeddings())
+def summarize(text, max_chars=3000):
+    """Summarize the beginning of ``text`` with the summarization pipeline.
+
+    Args:
+        text: Document text.
+        max_chars: Number of leading characters (after whitespace is
+            collapsed) handed to the model.  Defaults to 3000.
+
+    Returns:
+        The generated summary, or a fixed notice when fewer than 50
+        characters of actual content are available.
+    """
+    # Extracted PDF text can contain long runs of spaces and newlines;
+    # collapse them so the character budget is spent on content and a
+    # whitespace-only document is treated as empty.
+    cleaned = " ".join(text.split())
+    if len(cleaned) < 50:
+        return "Document too short to summarize."
+    truncated = cleaned[:max_chars]
+    # ``truncation=True`` makes the tokenizer clip the input to the model's
+    # limit; a character cap alone does not bound the token count.
+    result = sum_pipe(truncated, max_length=250, min_length=60, do_sample=False, truncation=True)
+    return result[0]["summary_text"]
+def answer(question, index):
+    """Answer ``question`` from the four most similar chunks in ``index``.
+
+    Args:
+        question: The user's question.
+        index: Vector store offering ``similarity_search``, or ``None`` when
+            no PDF has been processed yet.
+
+    Returns:
+        The text generated by the QA pipeline, or a fixed hint when ``index``
+        is ``None``.
+    """
+    if index is not None:
+        hits = index.similarity_search(question, k=4)
+        context = "\n".join(hit.page_content for hit in hits)
+        prompt = (
+            "Answer the question using ONLY the context below.\n\n"
+            f"Context:\n{context}\n\n"
+            f"Question: {question}"
+        )
+        generated = qa_pipe(prompt, max_length=256, do_sample=False)
+        return generated[0]["generated_text"]
+    return "Please upload & process a PDF first."
+###############################################################################
+# Session init
+###############################################################################
+# Seed the per-session state on first run; keys that already exist are left
+# untouched.  Factories are used so each session gets its own fresh list.
+_SESSION_DEFAULTS = {
+    "messages": list,        # chat history as (role, text) tuples
+    "index": lambda: None,   # vector index of the processed PDF
+    "raw_text": str,         # extracted document text
+    "images": list,          # images extracted from the PDF
+}
+for _key, _make_default in _SESSION_DEFAULTS.items():
+    if _key not in st.session_state:
+        st.session_state[_key] = _make_default()
+###############################################################################
+# Sidebar
+###############################################################################
+# Sidebar: upload a PDF, process it on demand, and preview extracted images.
+with st.sidebar:
+    st.subheader("📁 Upload PDF")
+    uploaded = st.file_uploader("Choose a file", type="pdf", label_visibility="collapsed")
+    if uploaded and st.button("Process PDF"):
+        with st.spinner("Extracting text & images…"):
+            try:
+                new_text, new_images = extract_pdf(uploaded)
+                # An empty document cannot be indexed, so skip the index build.
+                new_index = build_index(new_text) if new_text else None
+            except Exception as exc:
+                # UI boundary: keep the app usable, log the traceback, and
+                # leave the previously processed document in place.
+                logger.exception("Failed to process the uploaded PDF")
+                st.error(f"⚠️ Could not process this PDF: {exc}")
+            else:
+                if new_index is None:
+                    st.error("⚠️ Could not extract text from this PDF.")
+                else:
+                    # Commit everything together so text, images and index
+                    # always belong to the same document.
+                    st.session_state.raw_text = new_text
+                    st.session_state.images = new_images
+                    st.session_state.index = new_index
+                    st.session_state.messages = []
+                    st.toast("PDF ready!")
+    if st.session_state.images:
+        st.subheader("🖼️ Extracted Images")
+        for im in st.session_state.images:
+            st.image(im, use_column_width=True)
+###############################################################################
+# Main Tabs
+###############################################################################
+import html
+def _chat_bubble(role, text):
+    """Render one chat message as a styled bubble.
+
+    ``text`` is HTML-escaped because it goes into raw HTML: it is either
+    typed by the user or generated by the model, so it must not be able to
+    inject markup into the page.
+    """
+    css = "user" if role == "user" else "assistant"
+    st.markdown(f'<div class="chat-msg {css}">{html.escape(text)}</div>', unsafe_allow_html=True)
+tab_chat, tab_sum = st.tabs(["💬 Chat", "📄 Summarize"])
+with tab_chat:
+    if st.session_state.index is None:
+        st.info("Upload & process a PDF first using the sidebar.")
+    else:
+        # Replay the stored conversation.
+        for role, msg in st.session_state.messages:
+            _chat_bubble(role, msg)
+        # Handle a newly submitted question: store and show it, then answer.
+        if question := st.chat_input("Ask anything about the PDF…"):
+            st.session_state.messages.append(("user", question))
+            _chat_bubble("user", question)
+            with st.spinner("Thinking…"):
+                resp = answer(question, st.session_state.index)
+            st.session_state.messages.append(("assistant", resp))
+            _chat_bubble("assistant", resp)
+with tab_sum:
+    # Nothing to summarize until a PDF has been processed; otherwise the
+    # summary is generated only when the button is pressed.
+    if not st.session_state.raw_text:
+        st.info("Upload & process a PDF first.")
+    elif st.button("Generate Summary"):
+        with st.spinner("Summarizing…"):
+            result = summarize(st.session_state.raw_text)
+        st.subheader("Summary")
+        st.write(result)