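# Streamlit app: upload .txt/.pdf documents, index them in a Chroma vector
# store with Cohere embeddings, then drive a LangGraph pipeline that derives
# beginner/middle/advanced sub-topics for a subject and writes a short
# explanatory story for each one via a retrieve -> rerank -> generate subgraph.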
import os
import re
import traceback
import operator
import tempfile
from pathlib import Path
from typing import List, Annotated, Any

import streamlit as st
import chromadb
import cohere
from pydantic import BaseModel

# Partner/community import paths; adjust if your installed langchain version
# still exposes these under the legacy `langchain.*` modules.
from langchain_cohere import ChatCohere, CohereEmbeddings
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage
from langgraph.graph import StateGraph, START, END, add_messages
from langgraph.constants import Send
from langgraph.checkpoint.memory import MemorySaver
# Reset any Chroma client cached from a previous Streamlit rerun so the same
# process can open a fresh store without stale-client conflicts.
chromadb.api.client.SharedSystemClient.clear_system_cache()

COHERE_API_KEY = os.environ["COHERE_API_KEY"]
co = cohere.Client(COHERE_API_KEY)

# Each session embeds into a fresh temporary directory.
persist_dir = tempfile.mkdtemp()
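# Build the vector store from the uploaded files: save each upload to a temp
# path, load it with the matching loader, split it into overlapping chunks,
# and embed the chunks into Chroma under `persist_dir`.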
def prepare_vectorstore(uploaded_files=None):
    documents = []
    if uploaded_files and any(file.size > 0 for file in uploaded_files):
        st.write("📂 Uploaded files:")
        for file in uploaded_files:
            st.write(f"• {file.name} ({file.size} bytes)")
            file_path = Path(tempfile.gettempdir()) / file.name
            try:
                with open(file_path, "wb") as f:
                    f.write(file.getbuffer())
                st.write(f"✅ Saved to: {file_path}")
                if file.name.endswith(".pdf"):
                    st.write(f"📄 Loading PDF: {file.name}")
                    loader = PyPDFLoader(str(file_path))
                elif file.name.endswith(".txt"):
                    st.write(f"📄 Loading TXT: {file.name}")
                    loader = TextLoader(str(file_path))
                else:
                    st.warning(f"Unsupported file type: {file.name}")
                    continue
                loaded = loader.load()
                st.write(f"Loaded {len(loaded)} pages from {file.name}")
                documents.extend(loaded)
            except Exception as e:
                st.error(f"Error loading {file.name}:")
                st.exception(e)
                st.text(traceback.format_exc())
    else:
        st.warning("No uploaded files found, or all of them were empty.")
        st.stop()
    if not documents:
        st.error("No content could be loaded from the uploaded files.")
        st.stop()

    st.write("Splitting documents into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    docs = splitter.split_documents(documents)
    st.write(f"Total chunks created: {len(docs)}")
    if not docs:
        st.error("No content found in the documents after splitting.")
        st.stop()

    st.write("Embedding documents...")
    embedding = CohereEmbeddings(
        model="embed-multilingual-light-v3.0",
        cohere_api_key=COHERE_API_KEY,
        user_agent="langgraph-app"
    )
    try:
        # Recent Chroma versions persist automatically when a
        # persist_directory is given, so no explicit .persist() is needed.
        vectorstore = Chroma.from_documents(
            documents=docs,
            embedding=embedding,
            persist_directory=persist_dir
        )
        st.success("Document embedding complete.")
        return vectorstore
    except Exception as e:
        st.error("Embedding failed:")
        st.exception(e)
        st.text(traceback.format_exc())
        st.stop()
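# --- Graph state schemas ---
# LangGraph merges each node's return dict into these models. Fields annotated
# with `add_messages` append messages instead of overwriting, and fields
# annotated with `operator.add` concatenate lists, so the parallel story
# branches spawned later can all accumulate into the same channels.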
class State(BaseModel):
    messages: Annotated[list[AnyMessage], add_messages] = []
    topic: List[str] = []
    context: List[str] = []
    sub_topic_list: List[str] = []
    sub_topics: Annotated[list[AnyMessage], add_messages] = []
    stories: Annotated[list[AnyMessage], add_messages] = []
    stories_lst: Annotated[list, operator.add] = []

class StoryState(BaseModel):
    retrieved_docs: List[Any] = []
    stories: Annotated[list[AnyMessage], add_messages] = []
    reranked_docs: List[str] = []
    story_topic: str = ""
    stories_lst: Annotated[list, operator.add] = []
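# The topic-generation prompts below tend to produce markdown bullet lists of
# the form "- **Topic name**"; this helper pulls the bolded names out of each
# accumulated message.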
def extract_topics(messages):
    topics = []
    for message in messages:
        topics.extend(re.findall(r'- \*\*(.*?)\*\*', message.content))
    return topics
embedding_llm = CohereEmbeddings(
    model="embed-multilingual-light-v3.0",
    cohere_api_key=COHERE_API_KEY,
    user_agent="langgraph-app"
)

llm = ChatCohere(
    temperature=0.7,
    model="command-r-plus-08-2024",
    cohere_api_key=COHERE_API_KEY
)
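# Persona system prompts: the same question is asked three times at increasing
# difficulty, with each persona framing what "level" of topics the model lists.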
beginner_topic_sys_msg = SystemMessage(content="Suppose you are a middle schooler who constantly wants to learn about new topics to get a good score in exams.")
middle_topic_sys_msg = SystemMessage(content="Suppose you are a college student who constantly wants to learn about new topics to get a good score in exams.")
advanced_topic_sys_msg = SystemMessage(content="Suppose you are a teacher who constantly wants to learn about new topics to teach your students.")
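# --- Story subgraph nodes: retrieve -> rerank -> generate ---
# Each spawned branch receives its own StoryState carrying a single sub-topic.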
def retrieve_node(state):
    topic = state.story_topic
    query = f"information about {topic}"
    retriever = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding_llm
    ).as_retriever(search_kwargs={"k": 20})
    docs = retriever.invoke(query)
    # Only return keys that exist on StoryState; unknown keys would make
    # LangGraph reject the update.
    return {"retrieved_docs": docs}
def rerank_node(state):
    topic = state.story_topic
    query = f"Rerank documents based on how well they explain the topic {topic}"
    docs = state.retrieved_docs
    texts = [doc.page_content for doc in docs]
    rerank_results = co.rerank(query=query, documents=texts, top_n=5, model="rerank-v3.5")
    top_docs = [texts[result.index] for result in rerank_results.results]
    return {"reranked_docs": top_docs}
def generate_story_node(state):
    context = "\n\n".join(state.reranked_docs)
    topic = state.story_topic
    system_message = """
    Suppose you're an amazing story writer and scientific thinker.
    You've written hundreds of story books explaining scientific topics in a childlike manner.
    You add subtle humor to your stories to make them more engaging.
    """
    prompt = f"""
    Use the following context to generate a simple, engaging story that explains {topic} in such a way that a middle schooler can understand it.

    Context:
    {context}

    Story:
    """
    response = llm.invoke([SystemMessage(content=system_message), HumanMessage(content=prompt)])
    return {"stories": response}
def beginner_topic(state: State):
    prompt = f"What are the beginner-level topics you can learn about {', '.join(state.topic)} in {', '.join(state.context)}?"
    response = llm.invoke([beginner_topic_sys_msg, HumanMessage(content=prompt)])
    return {"messages": response, "sub_topics": response}

def middle_topic(state: State):
    seen = "\n".join(m.content for m in state.sub_topics)
    prompt = f"What are the middle-level topics you can learn about {', '.join(state.topic)} in {', '.join(state.context)}? Don't include the topics below:\n\n{seen}"
    response = llm.invoke([middle_topic_sys_msg, HumanMessage(content=prompt)])
    return {"messages": response, "sub_topics": response}

def advanced_topic(state: State):
    seen = "\n".join(m.content for m in state.sub_topics)
    prompt = f"What are the advanced-level topics you can learn about {', '.join(state.topic)} in {', '.join(state.context)}? Don't include the topics below:\n\n{seen}"
    response = llm.invoke([advanced_topic_sys_msg, HumanMessage(content=prompt)])
    return {"messages": response, "sub_topics": response}
def topic_extractor(state: State):
    return {"sub_topic_list": extract_topics(state.sub_topics)}

def dynamic_topic_edges(state: State):
    return [Send("story_generator", {"story_topic": topic}) for topic in state.sub_topic_list]
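# --- Graph wiring ---
# The story subgraph (Retrieve -> Rerank -> Generate) is compiled on its own
# and mounted as a single node in the main graph. After topic_extractor, the
# conditional Send edges spawn one subgraph run per sub-topic; their `stories`
# channels merge back into the parent state via add_messages.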
story_builder = StateGraph(StoryState)
story_builder.add_node("Retrieve", retrieve_node)
story_builder.add_node("Rerank", rerank_node)
story_builder.add_node("Generate", generate_story_node)
story_builder.set_entry_point("Retrieve")
story_builder.add_edge("Retrieve", "Rerank")
story_builder.add_edge("Rerank", "Generate")
story_builder.set_finish_point("Generate")
story_graph = story_builder.compile()

main_builder = StateGraph(State)
main_builder.add_node("beginner_topic", beginner_topic)
main_builder.add_node("middle_topic", middle_topic)
main_builder.add_node("advanced_topic", advanced_topic)
main_builder.add_node("topic_extractor", topic_extractor)
main_builder.add_node("story_generator", story_graph)
main_builder.add_edge(START, "beginner_topic")
main_builder.add_edge("beginner_topic", "middle_topic")
main_builder.add_edge("middle_topic", "advanced_topic")
main_builder.add_edge("advanced_topic", "topic_extractor")
main_builder.add_conditional_edges("topic_extractor", dynamic_topic_edges, ["story_generator"])
main_builder.add_edge("story_generator", END)

memory = MemorySaver()
react_graph = main_builder.compile(checkpointer=memory, interrupt_after=["topic_extractor"])
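# `interrupt_after=["topic_extractor"]` pauses the run once sub-topics are
# extracted, so the caller can inspect or override `sub_topic_list` through
# the checkpointer before the story branches are spawned.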
st.title("LangGraph Topic Story Generator")

uploaded_files = st.file_uploader(
    "Upload .txt or .pdf files",
    type=["txt", "pdf"],
    accept_multiple_files=True,
    key="file_uploader"
)

if uploaded_files:
    st.session_state["files"] = uploaded_files
    st.success(f"{len(uploaded_files)} file(s) uploaded:")
    for file in uploaded_files:
        st.write(f"• {file.name} ({file.size} bytes)")
elif "files" in st.session_state:
    st.info("Using previously uploaded files:")
    for file in st.session_state["files"]:
        st.write(f"• {file.name} ({file.size} bytes)")
else:
    st.info("No files uploaded yet.")

topic = st.text_input("Enter a topic", "Human Evolution")
context = st.text_input("Enter a context", "Science")
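# Generating stories is a two-phase run: invoke until the graph interrupts
# after topic_extractor, optionally edit the checkpointed state, then resume
# the same thread with `invoke(None, ...)`.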
if st.button("Generate Stories"):
    uploaded = st.session_state.get("files")
    if not uploaded or all(file.size == 0 for file in uploaded):
        st.warning("No usable files found; please upload at least one non-empty file.")
        st.stop()
    try:
        prepare_vectorstore(uploaded)
        thread = {"configurable": {"thread_id": "1"}}
        # First pass: runs up to the interrupt after topic_extractor.
        react_graph.invoke({"topic": [topic], "context": [context]}, thread)
        # Demo shortcut: override the extracted sub-topics with a fixed list.
        # Drop this line to keep the topics the LLM actually extracted.
        react_graph.update_state(thread, {"sub_topic_list": ['Early Hominins', 'Fossil Evidence', "Darwin's Theory of Evolution"]})
        # Resume from the checkpoint; `None` means "continue, no new input".
        result = react_graph.invoke(None, thread, stream_mode="values")
        for story in result["stories"]:
            st.markdown(story.content)
    except Exception as e:
        st.error("Something went wrong during story generation.")
        st.exception(e)
        st.text(traceback.format_exc())