Create app.py
app.py
ADDED
@@ -0,0 +1,106 @@
import os
import streamlit as st
import asyncio
import nest_asyncio

nest_asyncio.apply()  # allow asyncio.run() inside Streamlit's running event loop

# ─── LlamaIndex & Parser Imports ─────────────────────────────────
# (paths follow the post-0.10 package split: llama-index-core plus the
#  OpenAI LLM/embedding integration packages)
from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_storage
from llama_index.core.workflow import Workflow, step, Event, StartEvent, StopEvent
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_parse import LlamaParse

# ─── Constants ───────────────────────────────────────────────────
PDF_PATH = "./data/my_doc.pdf"  # your single PDF
INDEX_DIR = "./index_data"
SYSTEM_PROMPT = (
    "You are an expert analyst, who excels in analyzing a company's "
    "earnings call deck. Answer questions ONLY from the indexed document."
)

# ─── Workflow Definition ─────────────────────────────────────────
class ChatResponseEvent(Event):
    response: str
    memory: ChatMemoryBuffer

class ChatWorkflow(Workflow):
    @step
    async def answer(self, ev: StartEvent) -> ChatResponseEvent:
        # load the persisted index, querying with the same embedding model
        # it was built with (otherwise retrieval silently degrades)
        storage = StorageContext.from_defaults(persist_dir=ev.index_dir)
        index = load_index_from_storage(
            storage,
            embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
        )
        # build chat engine with memory & system prompt
        chat_engine = index.as_chat_engine(
            chat_mode="context",
            memory=ev.memory,
            system_prompt=ev.system_prompt,
            llm=ev.llm,
        )  # "context" chat mode: retrieve first, then answer from the retrieved context
        # single-turn chat (async variant, since steps run on the event loop)
        resp = await chat_engine.achat(ev.query)
        return ChatResponseEvent(response=resp.response, memory=ev.memory)

    @step
    async def finalize(self, ev: ChatResponseEvent) -> StopEvent:
        return StopEvent(result=ev.response)
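
# Note: keyword arguments passed to workflow.run(...) below surface as
# attributes on the StartEvent (ev.index_dir, ev.query, ev.system_prompt,
# ev.memory, ev.llm), and steps are chained by their event-type annotations:
# StartEvent -> ChatResponseEvent -> StopEvent.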

# ─── Streamlit UI & Session State ────────────────────────────────
st.set_page_config(page_title="PDF Chatbot", layout="wide")  # responsive layout
st.title("📄 Chat with Your PDF")

# 1) Ingest once (reuse an index persisted by an earlier run if one exists)
if "index_ready" not in st.session_state:
    if not os.path.isdir(INDEX_DIR):
        docs = LlamaParse(
            result_type="markdown",
            content_guideline_instruction=(
                "You are processing a company's quarterly earnings-call slide deck. "
                "For each slide, produce a clearly sectioned Markdown fragment that includes:\n\n"
                "1. **Slide metadata**: slide number, title, and any subtitle or date\n"
                "2. **Key bullet points**: preserve existing bullets, but rewrite for clarity\n"
                "3. **Tables**: convert any tables into Markdown tables, capturing headers and all rows\n"
                "4. **Charts & graphs**: summarize each chart/graph in prose, highlighting axis labels, trends, and the top 3 data points or percentage changes\n"
                "5. **Figures & images**: if there's a figure caption, include it verbatim; otherwise, describe the visual in one sentence\n"
                "6. **Numeric callouts**: pull out any KPIs (revenue, EPS, growth rates) into a \"Metrics\" subsection\n"
                "7. **Overall slide summary**: a 1-2 sentence plain-English takeaway for the slide's purpose or conclusion\n\n"
                "Keep the output strictly in Markdown, using headings (`##`, `###`), lists (`-`), and table syntax. "
                "Do not include any LLM-specific commentary or markdown outside these rules."
            )
        ).load_data(PDF_PATH)  # tailored parsing instruction
        idx = VectorStoreIndex.from_documents(
            docs,
            embed_model=OpenAIEmbedding(model="text-embedding-3-small")
        )
        idx.storage_context.persist(persist_dir=INDEX_DIR)
    st.session_state.index_ready = True
    st.success("✅ Indexed your document!")  # user feedback

# 2) Initialize memory & workflow
if "memory" not in st.session_state:
    st.session_state.memory = ChatMemoryBuffer.from_defaults(
        llm=OpenAI(model="gpt-4o"), token_limit=1500
    )  # simple rolling chat memory
if "workflow" not in st.session_state:
    st.session_state.workflow = ChatWorkflow(timeout=None, verbose=False)
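
# st.session_state survives Streamlit's top-to-bottom script reruns, so the
# memory buffer and workflow instance above persist across user turns.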

# 3) User input
user_input = st.text_input("Ask a question about the document:")
if user_input:
    async def _ask(question: str) -> str:
        # awaiting the handler returned by run(...) yields the StopEvent's result
        return await st.session_state.workflow.run(
            index_dir=INDEX_DIR,
            query=question,
            system_prompt=SYSTEM_PROMPT,
            memory=st.session_state.memory,
            llm=OpenAI(model="gpt-4o"),
        )

    answer = asyncio.run(_ask(user_input))
    # no memory reassignment needed: the ChatMemoryBuffer held in
    # session_state is updated in place by the chat engine
    st.markdown(f"**Bot:** {answer}")

# 4) End Chat button
if st.button("End Chat"):
    st.write("Chat ended. Refresh to start over.")
    st.stop()
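
A note on running this (assumptions, since the commit itself doesn't say): LlamaParse reads its key from `LLAMA_CLOUD_API_KEY` and the OpenAI LLM/embeddings read `OPENAI_API_KEY`, and the PDF must already exist at `./data/my_doc.pdf`. With `streamlit`, `nest-asyncio`, `llama-index`, and `llama-parse` installed, the app starts with `streamlit run app.py`.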