getGO007 committed
Commit 1d253ce · verified · 1 Parent(s): 3d9fe53

Create app.py

Files changed (1):
  1. app.py +106 -0

app.py ADDED
@@ -0,0 +1,106 @@
+ import os
+ import streamlit as st
+ import asyncio
+ import nest_asyncio
+ nest_asyncio.apply()  # allow nested asyncio event loops inside Streamlit
+
+ # ─── LlamaIndex & Parser Imports ────────────────────────────────
+ from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_storage
+ from llama_index.llms.openai import OpenAI
+ from llama_parse import LlamaParse
+ from llama_index.embeddings.openai import OpenAIEmbedding
+ from llama_index.core.workflow import Workflow, step, Event, StartEvent, StopEvent
+ from llama_index.core.memory import ChatMemoryBuffer
+
+ # ─── Constants ───────────────────────────────────────────────────
+ PDF_PATH = "./data/my_doc.pdf"  # your single PDF
+ INDEX_DIR = "./index_data"
+ SYSTEM_PROMPT = (
+     "You are an expert analyst who excels at analyzing a company's earnings-call deck. "
+     "Answer questions ONLY from the indexed document."
+ )
+
+ # ─── Workflow Definition ─────────────────────────────────────────
+ class ChatResponseEvent(Event):
+     response: str
+     memory: ChatMemoryBuffer
+
+ class ChatWorkflow(Workflow):
+     @step
+     async def answer(self, ev: StartEvent) -> ChatResponseEvent:
+         # load the persisted index from disk
+         storage = StorageContext.from_defaults(persist_dir=ev.index_dir)
+         index = load_index_from_storage(storage)
+         # build a chat engine with memory & system prompt
+         chat_engine = index.as_chat_engine(
+             chat_mode="context",
+             memory=ev.memory,
+             system_prompt=ev.system_prompt,
+             llm=ev.llm,
+         )  # "context" chat mode: retrieve relevant chunks, then answer from them
+         # single-turn chat
+         resp = chat_engine.chat(ev.query)
+         return ChatResponseEvent(response=resp.response, memory=ev.memory)
+
+     @step
+     async def finalize(self, ev: ChatResponseEvent) -> StopEvent:
+         return StopEvent(result=ev.response)
+
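+ # Note: keyword arguments passed to Workflow.run() (index_dir, query,
+ # system_prompt, memory, llm) surface as attributes on the StartEvent,
+ # which is why answer() can read ev.index_dir, ev.query, and so on.
+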
+ # ─── Streamlit UI & Session State ────────────────────────────────
+ st.set_page_config(page_title="PDF Chatbot", layout="wide")  # responsive wide layout
+ st.title("📄 Chat with Your PDF")
+
+ # 1) Ingest once per session
+ if "index_ready" not in st.session_state:
+     docs = LlamaParse(
+         result_type="markdown",
+         content_guideline_instruction=(
+             "You are processing a company’s quarterly earnings-call slide deck. "
+             "For each slide, produce a clearly sectioned Markdown fragment that includes:\n\n"
+             "1. **Slide metadata**: slide number, title, and any subtitle or date\n"
+             "2. **Key bullet points**: preserve existing bullets, but rewrite for clarity\n"
+             "3. **Tables**: convert any tables into Markdown tables, capturing headers and all rows\n"
+             "4. **Charts & graphs**: summarize each chart/graph in prose, highlighting axis labels, trends, and the top 3 data points or percentage changes\n"
+             "5. **Figures & images**: if there’s a figure caption, include it verbatim; otherwise, describe the visual in one sentence\n"
+             "6. **Numeric callouts**: pull out any KPIs (revenue, EPS, growth rates) into a “Metrics” subsection\n"
+             "7. **Overall slide summary**: a 1–2-sentence plain-English takeaway for the slide’s purpose or conclusion\n\n"
+             "Keep the output strictly in Markdown, using headings (`##`, `###`), lists (`-`), and table syntax. "
+             "Do not include any LLM-specific commentary or markdown outside these rules."
+         )
+     ).load_data(PDF_PATH)  # tailored parsing instruction
+     idx = VectorStoreIndex.from_documents(
+         docs,
+         embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
+     )
+     idx.storage_context.persist(persist_dir=INDEX_DIR)
+     st.session_state.index_ready = True
+     st.success("📚 Indexed your document!")  # user feedback
+
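+ # Sketch (assumption, not in this commit): each fresh session re-parses the
+ # PDF even when an index is already persisted; one could reuse it instead:
+ #   if os.path.exists(INDEX_DIR):
+ #       st.session_state.index_ready = True
+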
+ # 2) Initialize memory & workflow
+ if "memory" not in st.session_state:
+     st.session_state.memory = ChatMemoryBuffer.from_defaults(
+         llm=OpenAI(model="gpt-4o"), token_limit=1500
+     )  # simple token-limited chat memory
+ if "workflow" not in st.session_state:
+     st.session_state.workflow = ChatWorkflow(timeout=None, verbose=False)
+
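+ # Note: the chat engine appends every turn to this ChatMemoryBuffer in place,
+ # so conversation history carries across Streamlit reruns via session state.
+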
+ # 3) User input
+ user_input = st.text_input("Ask a question about the document:")
+ if user_input:
+     # run() resolves to the StopEvent's result, i.e. the response text
+     answer = asyncio.run(
+         st.session_state.workflow.run(
+             index_dir=INDEX_DIR,
+             query=user_input,
+             system_prompt=SYSTEM_PROMPT,
+             memory=st.session_state.memory,
+             llm=OpenAI(model="gpt-4o"),
+         )
+     )
+     st.markdown(f"**Bot:** {answer}")
+
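+ # Note: a plain asyncio.run() conflicts with an already-running event loop;
+ # the nest_asyncio.apply() call at the top is what makes this pattern safe.
+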
+ # 4) End Chat button
+ if st.button("End Chat"):
+     st.write("Chat ended. Refresh to start over.")
+     st.stop()
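The app assumes API keys are available as environment variables: LlamaParse reads LLAMA_CLOUD_API_KEY, and the OpenAI LLM and embedding classes read OPENAI_API_KEY. A minimal pre-flight check along those lines (a sketch, assuming the standard variable names) could sit at the top of app.py:

    import os

    # fail fast with a clear message instead of a mid-request API error
    for key in ("OPENAI_API_KEY", "LLAMA_CLOUD_API_KEY"):
        if not os.environ.get(key):
            raise RuntimeError(f"Set {key} before running: streamlit run app.py")

With the keys set and ./data/my_doc.pdf in place, launch the app with `streamlit run app.py`.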