import os

import streamlit as st
import nest_asyncio

# Streamlit runs this script inside an existing asyncio event loop, so patch
# the loop to allow nested / re-entrant use before anything else touches it.
nest_asyncio.apply()

import asyncio

from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_storage
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.workflow import Event, StartEvent, StopEvent, Workflow, step
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_parse import LlamaParse

PDF_PATH = "./data/bank-of-america.pdf"
INDEX_DIR = "./index_data"
SYSTEM_PROMPT = (
    "You are an expert analyst who excels at analyzing a company's earnings-call deck. "
    "Answer questions ONLY from the indexed document."
)
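
# Assumption: OpenAI and LlamaParse read their credentials from the
# environment, so OPENAI_API_KEY and LLAMA_CLOUD_API_KEY must be set
# before launching the app.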


class ChatResponseEvent(Event):
    """Carries the chat engine's reply plus the shared memory buffer between steps."""

    response: str
    memory: ChatMemoryBuffer


class ChatWorkflow(Workflow):
    @step
    async def answer(self, ev: StartEvent) -> ChatResponseEvent:
        # Reload the persisted index and wrap it in a "context" chat engine,
        # which injects retrieved chunks into the prompt on every turn.
        storage = StorageContext.from_defaults(persist_dir=ev.index_dir)
        index = load_index_from_storage(storage)
        chat_engine = index.as_chat_engine(
            chat_mode="context",
            memory=ev.memory,
            system_prompt=ev.system_prompt,
            llm=ev.llm,
        )
        resp = chat_engine.chat(ev.query)
        return ChatResponseEvent(response=resp.response, memory=ev.memory)

    @step
    async def finalize(self, ev: ChatResponseEvent) -> StopEvent:
        return StopEvent(result=ev.response)
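
# A minimal sketch of driving the workflow outside Streamlit, assuming the
# index was already persisted to INDEX_DIR (the query string is illustrative):
#
#   async def main() -> None:
#       memory = ChatMemoryBuffer.from_defaults(llm=OpenAI(model="gpt-4o"))
#       answer = await ChatWorkflow(timeout=None).run(
#           index_dir=INDEX_DIR,
#           query="Summarize the quarter's key metrics.",
#           system_prompt=SYSTEM_PROMPT,
#           memory=memory,
#           llm=OpenAI(model="gpt-4o"),
#       )
#       print(answer)
#
#   asyncio.run(main())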


st.set_page_config(page_title="PDF Chatbot", layout="wide")
st.title("📄 Chat with Your PDF")


if "index_ready" not in st.session_state:
    os.makedirs(INDEX_DIR, exist_ok=True)
    index_meta = os.path.join(INDEX_DIR, "index_store.json")
    if os.path.isfile(index_meta):
        st.session_state.index_ready = True
        st.success("✅ Loaded existing index!")
    else:
        # Parse the PDF once with LlamaParse, then embed and persist the index.
        docs = LlamaParse(
            result_type="markdown",
            content_guideline_instruction=(
                "You are processing a company's quarterly earnings-call slide deck. "
                "For each slide, produce a clearly sectioned Markdown fragment that includes:\n\n"
                "1. **Slide metadata**: slide number, title, and any subtitle or date\n"
                "2. **Key bullet points**: preserve existing bullets, but rewrite for clarity\n"
                "3. **Tables**: convert any tables into Markdown tables, capturing headers and all rows\n"
                "4. **Charts & graphs**: summarize each chart/graph in prose, highlighting axis labels, trends, and the top 3 data points or percentage changes\n"
                "5. **Figures & images**: if there's a figure caption, include it verbatim; otherwise, describe the visual in one sentence\n"
                "6. **Numeric callouts**: pull out any KPIs (revenue, EPS, growth rates) into a \"Metrics\" subsection\n"
                "7. **Overall slide summary**: a 1-2 sentence plain-English takeaway for the slide's purpose or conclusion\n\n"
                "Keep the output strictly in Markdown, using headings (`##`, `###`), lists (`-`), and table syntax. "
                "Do not include any LLM-specific commentary or markdown outside these rules."
            ),
        ).load_data(PDF_PATH)
        idx = VectorStoreIndex.from_documents(
            docs,
            embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
        )
        idx.storage_context.persist(persist_dir=INDEX_DIR)
        st.session_state.index_ready = True
        st.success("✅ Indexed your document and created index_store.json!")


# Keep the memory buffer and workflow instance alive across Streamlit reruns.
if "memory" not in st.session_state:
    st.session_state.memory = ChatMemoryBuffer.from_defaults(
        llm=OpenAI(model="gpt-4o"), token_limit=1500
    )
if "workflow" not in st.session_state:
    st.session_state.workflow = ChatWorkflow(timeout=None, verbose=False)


user_input = st.text_input("Ask a question about the document:")
if user_input:
    # nest_asyncio lets us drive the existing loop to completion from
    # Streamlit's synchronous script. (Scheduling the workflow with
    # asyncio.run_coroutine_threadsafe would block forever here, because no
    # other thread is running this loop.)
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(
        st.session_state.workflow.run(
            index_dir=INDEX_DIR,
            query=user_input,
            system_prompt=SYSTEM_PROMPT,
            memory=st.session_state.memory,
            llm=OpenAI(model="gpt-4o"),
        )
    )

    # workflow.run() resolves to the StopEvent's result (the response text);
    # the chat engine has already updated st.session_state.memory in place.
    st.markdown(f"**Bot:** {result}")


if st.button("End Chat"):
    st.write("Chat ended. Refresh to start over.")
    st.stop()