Update app.py
Browse files
app.py
CHANGED
@@ -50,31 +50,39 @@ class ChatWorkflow(Workflow):
|
|
50 |
st.set_page_config(page_title="PDF Chatbot", layout="wide") # responsive layout
|
51 |
st.title("📄 Chat with Your PDF")
|
52 |
|
53 |
-
# 1) Ingest once
|
54 |
if "index_ready" not in st.session_state:
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
"
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
)
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
)
|
75 |
-
idx.storage_context.persist(persist_dir=INDEX_DIR)
|
76 |
-
st.session_state.index_ready = True
|
77 |
-
st.success("📚 Indexed your document!") # user feedback
|
78 |
|
79 |
# 2) Initialize memory & workflow
|
80 |
if "memory" not in st.session_state:
|
|
|
50 |
st.set_page_config(page_title="PDF Chatbot", layout="wide") # responsive layout
|
51 |
st.title("📄 Chat with Your PDF")
|
52 |
|
53 |
+
# 1) Ingest once or load existing index via index_store.json
|
54 |
if "index_ready" not in st.session_state:
|
55 |
+
index_meta = os.path.join(INDEX_DIR, "index_store.json")
|
56 |
+
if os.path.isfile(index_meta):
|
57 |
+
# Found persisted LlamaIndex metadata → skip parsing/embedding and only flag the index as ready (nothing is loaded in this branch)
|
58 |
+
st.session_state.index_ready = True
|
59 |
+
st.success("📚 Loaded existing index from index_store.json!")
|
60 |
+
else:
|
61 |
+
# No index_store.json → build index now
|
62 |
+
docs = LlamaParse(
|
63 |
+
result_type="markdown",
|
64 |
+
content_guideline_instruction=(
|
65 |
+
"You are processing a company’s quarterly earnings-call slide deck. "
|
66 |
+
"For each slide, produce a clearly sectioned Markdown fragment that includes:\n\n"
|
67 |
+
"1. **Slide metadata**: slide number, title, and any subtitle or date\n"
|
68 |
+
"2. **Key bullet points**: preserve existing bullets, but rewrite for clarity\n"
|
69 |
+
"3. **Tables**: convert any tables into Markdown tables, capturing headers and all rows\n"
|
70 |
+
"4. **Charts & graphs**: summarize each chart/graph in prose, highlighting axes labels, trends, and top 3 data points or percentage changes\n"
|
71 |
+
"5. **Figures & images**: if there’s a figure caption, include it verbatim; otherwise, describe the visual in one sentence\n"
|
72 |
+
"6. **Numeric callouts**: pull out any KPIs (revenue, EPS, growth rates) into a “Metrics” subsection\n"
|
73 |
+
"7. **Overall slide summary**: a 1–2-sentence plain-English takeaway for the slide’s purpose or conclusion\n\n"
|
74 |
+
"Keep the output strictly in Markdown, using headings (`##`, `###`), lists (`-`), and tables syntax. "
|
75 |
+
"Do not include any LLM-specific commentary or markdown outside these rules."
|
76 |
+
)
|
77 |
+
).load_data(PDF_PATH)
|
78 |
+
idx = VectorStoreIndex.from_documents(
|
79 |
+
docs,
|
80 |
+
embed_model=OpenAIEmbedding(model_name="text-embedding-3-small")
|
81 |
)
|
82 |
+
idx.storage_context.persist(persist_dir=INDEX_DIR)
|
83 |
+
st.session_state.index_ready = True
|
84 |
+
st.success("📚 Indexed your document and created index_store.json!") # user feedback
|
85 |
+
|
|
|
|
|
|
|
|
|
86 |
|
87 |
# 2) Initialize memory & workflow
|
88 |
if "memory" not in st.session_state:
|