Spaces:

Priyank1001
/

GATE-CSE-Chatbot

Build error

App Files Files Community

Priyank1001 committed on Apr 6

Commit

71ce55a

verified ·

1 Parent(s): 35bf616

initial commit

Browse files

Files changed (16) hide show

.gitattributes +10 -0
app.py +51 -0
create_memory_for_llm.py +63 -0
data/(CN 1) Computer Network.pdf +3 -0
data/(CN 2) DATA_COMMUNICATIONS_AND_NETWORKING_McGra.pdf +3 -0
data/(DSA) Introduction to Algorithms - 3rd Edition.pdf +3 -0
data/(OS) Operating System Concepts (9th Ed) - Gagne, Silberschatz, and Galvin.pdf +3 -0
data/CAO Hamacher.pdf +3 -0
data/DBMS Korth.pdf +3 -0
data/Discrete Mathematics and Its Applications Seventh Edition by Kenneth H. Rosen.pdf +3 -0
data/TOC.pdf +3 -0
models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +3 -0
rag_qa.py +79 -0
requirements.txt +11 -0
vectorstore/faiss_db/index.faiss +3 -0
vectorstore/faiss_db/index.pkl +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/(CN[[:space:]]1)[[:space:]][[:space:]]Computer[[:space:]]Network.pdf filter=lfs diff=lfs merge=lfs -text
+data/(CN[[:space:]]2)[[:space:]]DATA_COMMUNICATIONS_AND_NETWORKING_McGra.pdf filter=lfs diff=lfs merge=lfs -text
+data/(DSA)[[:space:]]Introduction[[:space:]]to[[:space:]]Algorithms[[:space:]]-[[:space:]]3rd[[:space:]]Edition.pdf filter=lfs diff=lfs merge=lfs -text
+data/(OS)[[:space:]]Operating[[:space:]]System[[:space:]]Concepts[[:space:]](9th[[:space:]]Ed)[[:space:]]-[[:space:]]Gagne,[[:space:]]Silberschatz,[[:space:]]and[[:space:]]Galvin.pdf filter=lfs diff=lfs merge=lfs -text
+data/CAO[[:space:]]Hamacher.pdf filter=lfs diff=lfs merge=lfs -text
+data/DBMS[[:space:]]Korth.pdf filter=lfs diff=lfs merge=lfs -text
+data/Discrete[[:space:]]Mathematics[[:space:]]and[[:space:]]Its[[:space:]]Applications[[:space:]]Seventh[[:space:]]Edition[[:space:]]by[[:space:]]Kenneth[[:space:]]H.[[:space:]]Rosen.pdf filter=lfs diff=lfs merge=lfs -text
+data/TOC.pdf filter=lfs diff=lfs merge=lfs -text
+models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
+vectorstore/faiss_db/index.faiss filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import streamlit as st
+from rag_qa import load_vectorstore, load_llm, generate_answer
+# Relative paths to the saved FAISS index directory and the local GGUF model
+# file; both are resolved against the directory the app is started from.
+FAISS_DB_PATH = "vectorstore/faiss_db"
+LLM_MODEL_PATH = "models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
+# Embedding model name handed to load_vectorstore().
+EMBEDDING_MODEL_NAME = "intfloat/e5-small"
+@st.cache_resource(show_spinner=False)
+def get_retriever_and_llm():
+    """Build the retriever and the LLM that the page uses to answer questions.
+
+    Decorated with ``st.cache_resource`` (spinner disabled).
+
+    Returns:
+        tuple: ``(retriever, llm)``. The retriever comes from the vector store
+        loaded from ``FAISS_DB_PATH`` and is created with
+        ``search_kwargs={"k": 3}``; the LLM is loaded from ``LLM_MODEL_PATH``.
+    """
+    store = load_vectorstore(FAISS_DB_PATH, EMBEDDING_MODEL_NAME)
+    search_options = {"k": 3}
+    # Tuple elements are evaluated left to right: retriever first, then LLM.
+    return store.as_retriever(search_kwargs=search_options), load_llm(LLM_MODEL_PATH)
+def main():
+    """Render the GATE CSE Assistant page and answer the submitted question.
+
+    The chat history lives in ``st.session_state.chat_history`` as a list of
+    ``{"question": ..., "answer": ...}`` dicts and is shown newest-first in
+    the sidebar. The sidebar history is drawn at the end of the run, so a
+    "Clear Chat" click or a freshly generated answer is visible immediately
+    instead of only after the next rerun.
+    """
+    st.set_page_config(page_title="GATE CSE Assistant", page_icon="🎓")
+    st.markdown("<h1 style='text-align: center;'>🎓 GATE CSE Assistant</h1>", unsafe_allow_html=True)
+    st.markdown("<p style='text-align: center;'>Ask any technical question related to GATE Computer Science syllabus.</p>", unsafe_allow_html=True)
+    # Session state for chat history
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+    # Sidebar: title, a reserved slot for the history, then the clear button.
+    with st.sidebar:
+        st.title("🧾 Chat History")
+        # Reserve the slot now (keeps the original layout) and fill it last,
+        # once this run's state changes have been applied.
+        history_box = st.container()
+        # Clear history button
+        if st.button("🗑️ Clear Chat"):
+            st.session_state.chat_history = []
+    # Load retriever and LLM
+    retriever, llm = get_retriever_and_llm()
+    # Main interaction
+    question = st.text_input("💬 Type your question:", placeholder="e.g. Explain paging in OS.")
+    if st.button("Get Answer"):
+        if question.strip():
+            # Only the slow call sits inside the spinner.
+            with st.spinner("Thinking..."):
+                answer = generate_answer(question, retriever, llm)
+            st.session_state.chat_history.append({"question": question, "answer": answer})
+            st.markdown("### ✅ Answer")
+            st.success(answer)
+        else:
+            # Tell the user why nothing happened instead of ignoring the click.
+            st.warning("Please type a question first.")
+    # Fill the reserved sidebar slot with the up-to-date history.
+    with history_box:
+        if st.session_state.chat_history:
+            for i, chat in enumerate(reversed(st.session_state.chat_history), 1):
+                st.markdown(f"**Q{i}:** {chat['question']}")
+                st.markdown(f"**A{i}:** {chat['answer']}")
+        else:
+            st.info("No questions asked yet.")
+if __name__ == "__main__":
+    main()

create_memory_for_llm.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+from langchain_huggingface import HuggingFaceEmbeddings
+import os
+# Pipeline configuration.
+# DATA_PATH is the folder searched for PDFs; FAISS_DB_PATH is where the index is saved.
+DATA_PATH = "data/"
+FAISS_DB_PATH = "vectorstore/faiss_db"
+# Embedding model name passed to HuggingFaceEmbeddings.
+EMBEDDING_MODEL_NAME = "intfloat/e5-small"
+# Default chunk_size / chunk_overlap given to RecursiveCharacterTextSplitter.
+CHUNK_SIZE = 500
+CHUNK_OVERLAP = 50
+def load_pdf_files(data_path):
+    """Load the PDFs matching ``*.pdf`` under ``data_path``.
+
+    Args:
+        data_path: Directory handed to ``DirectoryLoader``.
+
+    Returns:
+        The documents produced by the loader (``PyMuPDFLoader`` is used for
+        each file). Their count is logged as the number of pages.
+    """
+    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyMuPDFLoader)
+    pages = pdf_loader.load()
+    page_count = len(pages)
+    print(f"[INFO] Loaded {page_count} Pages from PDF Files.")
+    return pages
+def create_chunks(documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
+    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.
+
+    Args:
+        documents: Documents to split.
+        chunk_size: Forwarded to the splitter; defaults to ``CHUNK_SIZE``.
+        chunk_overlap: Forwarded to the splitter; defaults to ``CHUNK_OVERLAP``.
+
+    Returns:
+        The chunks returned by ``split_documents``; their count is logged.
+    """
+    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
+    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
+    pieces = text_splitter.split_documents(documents=documents)
+    print(f"[INFO] Created {len(pieces)} Chunks.")
+    return pieces
+def format_e5_chunks(chunks):
+    """Prefix every chunk's text with ``"passage: "``.
+
+    The chunks are modified in place: each item's ``page_content`` is
+    replaced by the prefixed text.
+
+    Args:
+        chunks: Iterable of objects exposing a ``page_content`` string.
+
+    Returns:
+        The same ``chunks`` object that was passed in.
+    """
+    passage_marker = "passage: "
+    for doc in chunks:
+        original_text = doc.page_content
+        doc.page_content = passage_marker + original_text
+    return chunks
+def get_embedding_model(model_name):
+    """Return a ``HuggingFaceEmbeddings`` for ``model_name`` with device set to ``"cpu"``."""
+    return HuggingFaceEmbeddings(
+        model_name=model_name,
+        model_kwargs={"device": "cpu"},
+    )
+def store_in_faiss(chunks, embedding_model, persist_path):
+    """Embed ``chunks`` into a FAISS index and save it under ``persist_path``.
+
+    Args:
+        chunks: Documents passed to ``FAISS.from_documents``.
+        embedding_model: Embedding object used to build the index.
+        persist_path: Directory the index is written to; created if missing.
+    """
+    # exist_ok=True avoids the check-then-create race of a separate
+    # os.path.exists() test and is a no-op when the directory is already there.
+    os.makedirs(persist_path, exist_ok=True)
+    db = FAISS.from_documents(
+        documents=chunks,
+        embedding=embedding_model
+    )
+    db.save_local(persist_path)
+    print(f"[INFO] FAISS Vector DB saved at: {persist_path}")
+def main():
+    """Run the indexing pipeline: load PDFs, chunk, add prefixes, embed, save."""
+    pages = load_pdf_files(DATA_PATH)
+    # Chunk first, then add the passage prefix to each chunk.
+    e5_ready = format_e5_chunks(create_chunks(documents=pages))
+    embedder = get_embedding_model(EMBEDDING_MODEL_NAME)
+    store_in_faiss(e5_ready, embedder, FAISS_DB_PATH)
+if __name__ == "__main__":
+    main()

data/(CN 1) Computer Network.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d71e4f9a669ec8dc8164f0d7742022c75278743ee54fb14c16ead300852fbeb
+size 6169180

data/(CN 2) DATA_COMMUNICATIONS_AND_NETWORKING_McGra.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7b3184a02a0f88c7769e9e1d22ecba242315ce41ce6d7daf1fd06b51dc728a7
+size 11611941

data/(DSA) Introduction to Algorithms - 3rd Edition.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:780800adc787535f1aba5083d71cf42384fbb80d40a4e2aea68e827092a6e32e
+size 5076764

data/(OS) Operating System Concepts (9th Ed) - Gagne, Silberschatz, and Galvin.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:32d9d9edd7f373464c74e810f9c9dd5d94d770e479ee469213c5fe3da8a0f6b1
+size 5477883

data/CAO Hamacher.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:59290ee2a90e714bfb1920fc680c1630c3e9a05c36b75fc8f218aa7e9053a193
+size 3149031

data/DBMS Korth.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4caafc657ce455e63e6f3a7b2992f81c49a668ca2ec7d83d1163b85d9c8da910
+size 17297558

data/Discrete Mathematics and Its Applications Seventh Edition by Kenneth H. Rosen.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c962d80120044e86cbadcf01211e0cd1997be2fbe6c48837caee5387662528ce
+size 21332313

data/TOC.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1773452ebf526f59da83abcf41e5b3f2de1ddc8504b7d11297a9f3a610656a41
+size 21979810

models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fecc3b3cd76bba89d504f29b616eedf7da85b96540e490ca5824d3f7d2776a0
+size 668788096

rag_qa.py ADDED Viewed

	@@ -0,0 +1,79 @@

+from langchain_community.vectorstores import FAISS
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain.prompts import PromptTemplate
+from langchain.chains import RetrievalQA
+from llama_cpp import Llama
+import os
+# Defaults used by main() below when this module is run as a script.
+FAISS_DB_PATH = "vectorstore/faiss_db"
+EMBEDDING_MODEL_NAME = "intfloat/e5-small"
+LLM_MODEL_PATH = "models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
+def load_vectorstore(path, embedding_model_name):
+    """Load the FAISS vector store saved at ``path``.
+
+    Args:
+        path: Directory containing the saved index.
+        embedding_model_name: Model name for ``HuggingFaceEmbeddings``
+            (device set to ``"cpu"``).
+
+    Returns:
+        The store returned by ``FAISS.load_local``.
+    """
+    cpu_only = {"device": "cpu"}
+    embedder = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs=cpu_only)
+    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
+    # the serialized index data; only point `path` at an index you trust.
+    return FAISS.load_local(path, embedder, allow_dangerous_deserialization=True)
+def load_llm(model_path, *, n_ctx=2048, n_batch=64, n_threads=4, verbose=False):
+    """Create a ``llama_cpp.Llama`` instance for the model at ``model_path``.
+
+    The tuning values were previously hard-coded; they are now keyword-only
+    parameters with the same defaults, so existing ``load_llm(path)`` calls
+    behave exactly as before.
+
+    Args:
+        model_path: Path to the model file.
+        n_ctx: Forwarded to ``Llama`` (default 2048).
+        n_batch: Forwarded to ``Llama`` (default 64).
+        n_threads: Forwarded to ``Llama`` (default 4).
+        verbose: Forwarded to ``Llama`` (default False).
+
+    Returns:
+        The constructed ``Llama`` object.
+    """
+    return Llama(
+        model_path=model_path,
+        n_ctx=n_ctx,
+        n_batch=n_batch,
+        n_threads=n_threads,
+        verbose=verbose,
+    )
+# E5 embedding models expect retrieval queries to start with "query: " and
+# indexed texts with "passage: "; the index here is built with the passage
+# prefix, so queries must carry the matching one.
+_E5_QUERY_PREFIX = "query: "
+_E5_PASSAGE_PREFIX = "passage: "
+def _strip_passage_prefix(text):
+    """Remove the index-time ``"passage: "`` marker from a retrieved chunk."""
+    if text.startswith(_E5_PASSAGE_PREFIX):
+        return text[len(_E5_PASSAGE_PREFIX):]
+    return text
+def generate_answer(question, retriever, llm):
+    """Answer ``question`` from retrieved context using ``llm``.
+
+    Args:
+        question: The user's question (sent to the LLM unchanged).
+        retriever: Object with ``invoke(query)`` returning documents that
+            expose ``page_content``.
+        llm: Callable taking ``(prompt, max_tokens=..., stop=...)`` and
+            returning a dict with ``["choices"][0]["text"]``.
+
+    Returns:
+        str: The generated text with surrounding whitespace removed.
+    """
+    # Retrieval uses the E5 query prefix; the prompt keeps the raw question.
+    docs = retriever.invoke(_E5_QUERY_PREFIX + question)
+    # Keep the embedding-only "passage: " marker out of the LLM prompt.
+    context = "\n\n".join(_strip_passage_prefix(doc.page_content) for doc in docs)
+    prompt = PROMPT_TEMPLATE.format(context=context, question=question)
+    response = llm(prompt, max_tokens=512, stop=["Question:", "Context:"])
+    return response["choices"][0]["text"].strip()
+# Prompt sent to the LLM; {context} and {question} are filled by str.format.
+PROMPT_TEMPLATE = """
+You are an AI tutor helping students prepare for the GATE CSE exam. Use the provided textbook-based context to answer the question accurately.
+Instructions:
+- Base your answer only on the context provided below. If the answer is not in the context, say "The context does not provide enough information to answer this question."
+- For **comparative questions**, respond using a **markdown table** with clearly labeled headers and rows.
+- Be **concise**, **accurate**, and **easy to understand**.
+- Avoid repeating the question or instructions in the answer.
+- Use bullet points or tables if that improves clarity.
+Context:
+{context}
+Question:
+{question}
+"""
+def main():
+    """Ask one question on the command line and print the generated answer.
+
+    Loads the vector store and LLM from the module-level paths, builds a
+    retriever with ``search_kwargs={"k": 5}``, and delegates the
+    retrieve-prompt-generate steps to ``generate_answer`` instead of
+    repeating them here.
+    """
+    vectorstore = load_vectorstore(FAISS_DB_PATH, EMBEDDING_MODEL_NAME)
+    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
+    llm = load_llm(LLM_MODEL_PATH)
+    question = input("Ask your GATE-CSE related question: ")
+    if not question.strip():
+        # Nothing to retrieve or answer for a blank line.
+        print("No question entered.")
+        return
+    answer = generate_answer(question, retriever, llm)
+    print("\n--- Answer ---\n")
+    print(answer)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+streamlit
+langchain
+langchain-community
+langchain-huggingface
+huggingface-hub
+transformers
+torch
+faiss-cpu
+llama-cpp-python
+sentence-transformers
+tqdm

vectorstore/faiss_db/index.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f1ba3fa56a742fbb651fdbe56f8e57133bb8b9571a47d3dd8943448270a6fec
+size 54082605

vectorstore/faiss_db/index.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7321e193ed6623f5e46c6b3a41324b721d8972253ec474899dbc92b793bc65c4
+size 23933172