getGO007 committed on
Commit
3e27771
·
verified ·
1 Parent(s): ada090d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -23
app.py CHANGED
@@ -50,31 +50,39 @@ class ChatWorkflow(Workflow):
50
  st.set_page_config(page_title="PDF Chatbot", layout="wide") # responsive layout
51
  st.title("📄 Chat with Your PDF")
52
 
53
- # 1) Ingest once
54
  if "index_ready" not in st.session_state:
55
- docs = LlamaParse(
56
- result_type="markdown",
57
- content_guideline_instruction=(
58
- "You are processing a company’s quarterly earnings-call slide deck. "
59
- "For each slide, produce a clearly sectioned Markdown fragment that includes:\n\n"
60
- "1. **Slide metadata**: slide number, title, and any subtitle or date\n"
61
- "2. **Key bullet points**: preserve existing bullets, but rewrite for clarity\n"
62
- "3. **Tables**: convert any tables into Markdown tables, capturing headers and all rows\n"
63
- "4. **Charts & graphs**: summarize each chart/graph in prose, highlighting axes labels, trends, and top 3 data points or percentage changes\n"
64
- "5. **Figures & images**: if there’s a figure caption, include it verbatim; otherwise, describe the visual in one sentence\n"
65
- "6. **Numeric callouts**: pull out any KPIs (revenue, EPS, growth rates) into a “Metrics” subsection\n"
66
- "7. **Overall slide summary**: a 1–2-sentence plain-English takeaway for the slide’s purpose or conclusion\n\n"
67
- "Keep the output strictly in Markdown, using headings (`##`, `###`), lists (`-`), and tables syntax. "
68
- "Do not include any LLM-specific commentary or markdown outside these rules."
 
 
 
 
 
 
 
 
 
 
 
 
69
  )
70
- ).load_data(PDF_PATH) # tailored parsing instruction
71
- idx = VectorStoreIndex.from_documents(
72
- docs,
73
- embed_model=OpenAIEmbedding(model_name="text-embedding-3-small")
74
- )
75
- idx.storage_context.persist(persist_dir=INDEX_DIR)
76
- st.session_state.index_ready = True
77
- st.success("📚 Indexed your document!") # user feedback
78
 
79
  # 2) Initialize memory & workflow
80
  if "memory" not in st.session_state:
 
50
  st.set_page_config(page_title="PDF Chatbot", layout="wide") # responsive layout
51
  st.title("📄 Chat with Your PDF")
52
 
53
+ # 1) Ingest once or load existing index via index_store.json
54
  if "index_ready" not in st.session_state:
55
+ index_meta = os.path.join(INDEX_DIR, "index_store.json")
56
+ if os.path.isfile(index_meta):
57
+ # Found LlamaIndex metadata → reuse existing index
58
+ st.session_state.index_ready = True
59
+ st.success("📚 Loaded existing index from index_store.json!")
60
+ else:
61
+ # No index_store.json found → build the index now
62
+ docs = LlamaParse(
63
+ result_type="markdown",
64
+ content_guideline_instruction=(
65
+ "You are processing a company’s quarterly earnings-call slide deck. "
66
+ "For each slide, produce a clearly sectioned Markdown fragment that includes:\n\n"
67
+ "1. **Slide metadata**: slide number, title, and any subtitle or date\n"
68
+ "2. **Key bullet points**: preserve existing bullets, but rewrite for clarity\n"
69
+ "3. **Tables**: convert any tables into Markdown tables, capturing headers and all rows\n"
70
+ "4. **Charts & graphs**: summarize each chart/graph in prose, highlighting axes labels, trends, and top 3 data points or percentage changes\n"
71
+ "5. **Figures & images**: if there’s a figure caption, include it verbatim; otherwise, describe the visual in one sentence\n"
72
+ "6. **Numeric callouts**: pull out any KPIs (revenue, EPS, growth rates) into a “Metrics” subsection\n"
73
+ "7. **Overall slide summary**: a 1–2-sentence plain-English takeaway for the slide’s purpose or conclusion\n\n"
74
+ "Keep the output strictly in Markdown, using headings (`##`, `###`), lists (`-`), and tables syntax. "
75
+ "Do not include any LLM-specific commentary or markdown outside these rules."
76
+ )
77
+ ).load_data(PDF_PATH)
78
+ idx = VectorStoreIndex.from_documents(
79
+ docs,
80
+ embed_model=OpenAIEmbedding(model_name="text-embedding-3-small")
81
  )
82
+ idx.storage_context.persist(persist_dir=INDEX_DIR)
83
+ st.session_state.index_ready = True
84
+ st.success("📚 Indexed your document and created index_store.json!") # user feedback
85
+
 
 
 
 
86
 
87
  # 2) Initialize memory & workflow
88
  if "memory" not in st.session_state: