Priyank1001 committed on
Commit
71ce55a
·
verified ·
1 Parent(s): 35bf616

initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/(CN[[:space:]]1)[[:space:]][[:space:]]Computer[[:space:]]Network.pdf filter=lfs diff=lfs merge=lfs -text
37
+ data/(CN[[:space:]]2)[[:space:]]DATA_COMMUNICATIONS_AND_NETWORKING_McGra.pdf filter=lfs diff=lfs merge=lfs -text
38
+ data/(DSA)[[:space:]]Introduction[[:space:]]to[[:space:]]Algorithms[[:space:]]-[[:space:]]3rd[[:space:]]Edition.pdf filter=lfs diff=lfs merge=lfs -text
39
+ data/(OS)[[:space:]]Operating[[:space:]]System[[:space:]]Concepts[[:space:]](9th[[:space:]]Ed)[[:space:]]-[[:space:]]Gagne,[[:space:]]Silberschatz,[[:space:]]and[[:space:]]Galvin.pdf filter=lfs diff=lfs merge=lfs -text
40
+ data/CAO[[:space:]]Hamacher.pdf filter=lfs diff=lfs merge=lfs -text
41
+ data/DBMS[[:space:]]Korth.pdf filter=lfs diff=lfs merge=lfs -text
42
+ data/Discrete[[:space:]]Mathematics[[:space:]]and[[:space:]]Its[[:space:]]Applications[[:space:]]Seventh[[:space:]]Edition[[:space:]]by[[:space:]]Kenneth[[:space:]]H.[[:space:]]Rosen.pdf filter=lfs diff=lfs merge=lfs -text
43
+ data/TOC.pdf filter=lfs diff=lfs merge=lfs -text
44
+ models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
45
+ vectorstore/faiss_db/index.faiss filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from rag_qa import load_vectorstore, load_llm, generate_answer
3
+
4
+ FAISS_DB_PATH = "vectorstore/faiss_db"
5
+ LLM_MODEL_PATH = "models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
6
+ EMBEDDING_MODEL_NAME = "intfloat/e5-small"
7
+
8
@st.cache_resource(show_spinner=False)
def get_retriever_and_llm():
    """Create the document retriever and the LLM used by the app.

    Wrapped in ``st.cache_resource`` so the vector store and the model are
    built once instead of on every Streamlit rerun.

    Returns:
        tuple: ``(retriever, llm)`` where ``retriever`` fetches the 3 most
        similar chunks from the FAISS store at ``FAISS_DB_PATH`` and ``llm``
        is whatever ``load_llm`` returns for ``LLM_MODEL_PATH``.
    """
    store = load_vectorstore(FAISS_DB_PATH, EMBEDDING_MODEL_NAME)
    top_k_retriever = store.as_retriever(search_kwargs={"k": 3})
    model = load_llm(LLM_MODEL_PATH)
    return top_k_retriever, model
14
+
15
def main():
    """Render the Streamlit UI for the GATE CSE assistant.

    The page shows a title, a question box and the generated answer; the
    sidebar lists previous question/answer pairs (newest first) followed by a
    button that clears them.

    The sidebar history is drawn at the END of the script run, into a
    container reserved earlier. Streamlit executes the script top to bottom
    on every interaction, so drawing the history before handling the
    "Clear Chat" and "Get Answer" buttons would always show the state from
    the previous run (a cleared history would still be visible, a new answer
    would be missing).
    """
    st.set_page_config(page_title="GATE CSE Assistant", page_icon="🎓")
    st.markdown("<h1 style='text-align: center;'>🎓 GATE CSE Assistant</h1>", unsafe_allow_html=True)
    st.markdown("<p style='text-align: center;'>Ask any technical question related to GATE Computer Science syllabus.</p>", unsafe_allow_html=True)

    # Session state for chat history
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Sidebar layout: title, reserved slot for the history, clear button.
    with st.sidebar:
        st.title("🧾 Chat History")
        history_area = st.container()

        # Clear history button (handled before the history is drawn)
        if st.button("🗑️ Clear Chat"):
            st.session_state.chat_history = []

    # Load retriever and LLM
    retriever, llm = get_retriever_and_llm()

    # Main interaction
    question = st.text_input("💬 Type your question:", placeholder="e.g. Explain paging in OS.")
    if st.button("Get Answer") and question.strip():
        with st.spinner("Thinking..."):
            answer = generate_answer(question, retriever, llm)
        st.session_state.chat_history.append({"question": question, "answer": answer})
        st.markdown("### ✅ Answer")
        st.success(answer)

    # Fill the reserved sidebar slot now that the history is up to date.
    with history_area:
        if st.session_state.chat_history:
            for i, chat in enumerate(reversed(st.session_state.chat_history), 1):
                st.markdown(f"**Q{i}:** {chat['question']}")
                st.markdown(f"**A{i}:** {chat['answer']}")
        else:
            st.info("No questions asked yet.")
49
+
50
+ if __name__ == "__main__":
51
+ main()
create_memory_for_llm.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain_community.vectorstores import FAISS
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ import os
6
+ # Step 1: load raw PDF(s)
7
+ DATA_PATH = "data/"
8
+ FAISS_DB_PATH = "vectorstore/faiss_db"
9
+ EMBEDDING_MODEL_NAME = "intfloat/e5-small"
10
+ CHUNK_SIZE = 500
11
+ CHUNK_OVERLAP = 50
12
+
13
def load_pdf_files(data_path):
    """Load the PDF files found in ``data_path``.

    Files matching the ``*.pdf`` glob are read with ``PyMuPDFLoader`` through
    a ``DirectoryLoader``.

    Args:
        data_path: Directory that contains the PDF files.

    Returns:
        The documents produced by the loader; their count is logged as the
        number of pages.
    """
    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyMuPDFLoader)
    pages = pdf_loader.load()
    print(f"[INFO] Loaded {len(pages)} Pages from PDF Files.")
    return pages
22
+
23
def create_chunks(documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Split ``documents`` into overlapping text chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk size handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = text_splitter.split_documents(documents=documents)
    print(f"[INFO] Created {len(pieces)} Chunks.")
    return pieces
32
+
33
def format_e5_chunks(chunks):
    """Prefix every chunk's text with ``"passage: "`` in place.

    Args:
        chunks: Iterable of objects exposing a writable ``page_content``
            string attribute.

    Returns:
        The same ``chunks`` object that was passed in (its items are mutated,
        not copied).
    """
    passage_tag = "passage: "
    for doc in chunks:
        doc.page_content = passage_tag + doc.page_content
    return chunks
38
+
39
def get_embedding_model(model_name):
    """Return a ``HuggingFaceEmbeddings`` instance for ``model_name`` on CPU."""
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": "cpu"},
    )
42
+
43
def store_in_faiss(chunks, embedding_model, persist_path):
    """Embed ``chunks`` into a FAISS index and save it under ``persist_path``.

    Args:
        chunks: Documents to index.
        embedding_model: Embedding model passed to ``FAISS.from_documents``.
        persist_path: Directory the index is written to; created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate
    # os.path.exists() test and is a no-op when the directory is present.
    os.makedirs(persist_path, exist_ok=True)

    db = FAISS.from_documents(
        documents=chunks,
        embedding=embedding_model,
    )
    db.save_local(persist_path)
    print(f"[INFO] FAISS Vector DB saved at: {persist_path}")
53
+
54
+
55
def main():
    """Build the FAISS vector store from the PDFs in ``DATA_PATH``.

    Pipeline: load PDFs -> split into chunks -> add the E5 passage prefix ->
    embed with ``EMBEDDING_MODEL_NAME`` -> save to ``FAISS_DB_PATH``.
    """
    pages = load_pdf_files(DATA_PATH)
    prefixed_chunks = format_e5_chunks(create_chunks(documents=pages))
    embedder = get_embedding_model(EMBEDDING_MODEL_NAME)
    store_in_faiss(prefixed_chunks, embedder, FAISS_DB_PATH)
61
+
62
+ if __name__ == "__main__":
63
+ main()
data/(CN 1) Computer Network.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d71e4f9a669ec8dc8164f0d7742022c75278743ee54fb14c16ead300852fbeb
3
+ size 6169180
data/(CN 2) DATA_COMMUNICATIONS_AND_NETWORKING_McGra.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7b3184a02a0f88c7769e9e1d22ecba242315ce41ce6d7daf1fd06b51dc728a7
3
+ size 11611941
data/(DSA) Introduction to Algorithms - 3rd Edition.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:780800adc787535f1aba5083d71cf42384fbb80d40a4e2aea68e827092a6e32e
3
+ size 5076764
data/(OS) Operating System Concepts (9th Ed) - Gagne, Silberschatz, and Galvin.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32d9d9edd7f373464c74e810f9c9dd5d94d770e479ee469213c5fe3da8a0f6b1
3
+ size 5477883
data/CAO Hamacher.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59290ee2a90e714bfb1920fc680c1630c3e9a05c36b75fc8f218aa7e9053a193
3
+ size 3149031
data/DBMS Korth.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4caafc657ce455e63e6f3a7b2992f81c49a668ca2ec7d83d1163b85d9c8da910
3
+ size 17297558
data/Discrete Mathematics and Its Applications Seventh Edition by Kenneth H. Rosen.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c962d80120044e86cbadcf01211e0cd1997be2fbe6c48837caee5387662528ce
3
+ size 21332313
data/TOC.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1773452ebf526f59da83abcf41e5b3f2de1ddc8504b7d11297a9f3a610656a41
3
+ size 21979810
models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fecc3b3cd76bba89d504f29b616eedf7da85b96540e490ca5824d3f7d2776a0
3
+ size 668788096
rag_qa.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import FAISS
2
+ from langchain_huggingface import HuggingFaceEmbeddings
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain.chains import RetrievalQA
5
+ from llama_cpp import Llama
6
+ import os
7
+
8
+ FAISS_DB_PATH = "vectorstore/faiss_db"
9
+ EMBEDDING_MODEL_NAME = "intfloat/e5-small"
10
+ LLM_MODEL_PATH = "models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
11
+
12
def load_vectorstore(path, embedding_model_name):
    """Load a FAISS vector store saved at ``path``.

    Args:
        path: Directory containing the saved FAISS index.
        embedding_model_name: Name of the HuggingFace embedding model used to
            embed queries (run on CPU).

    Returns:
        The loaded FAISS vector store.
    """
    cpu_embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        model_kwargs={"device": "cpu"},
    )
    # NOTE(security): allow_dangerous_deserialization=True opts in to
    # unpickling the stored index metadata. Only load indexes from a trusted
    # source; never point this at user-supplied files.
    return FAISS.load_local(
        path,
        cpu_embeddings,
        allow_dangerous_deserialization=True,
    )
20
+
21
def load_llm(model_path, *, n_ctx=2048, n_batch=64, n_threads=4, verbose=False):
    """Load a GGUF model with llama.cpp.

    The runtime settings used to be hard-coded; they are now keyword-only
    parameters whose defaults reproduce the previous values, so existing
    ``load_llm(path)`` calls behave exactly as before.

    Args:
        model_path: Path to the model file.
        n_ctx: Context window size passed to ``Llama``.
        n_batch: Batch size passed to ``Llama``.
        n_threads: Thread count passed to ``Llama``.
        verbose: Whether llama.cpp should print its own logging.

    Returns:
        The constructed ``Llama`` instance.
    """
    return Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_batch=n_batch,
        n_threads=n_threads,
        verbose=verbose,
    )
29
+
30
def generate_answer(question, retriever, llm, *, query_prefix="query: ", passage_prefix="passage: "):
    """Answer ``question`` from retrieved context using ``llm``.

    E5 embedding models expect retrieval queries to start with ``"query: "``
    when the indexed texts start with ``"passage: "``; embedding the bare
    question degrades retrieval quality. The indexing prefix is also removed
    from the retrieved text so it does not leak into the LLM prompt.

    Args:
        question: The user's question (inserted into the prompt unchanged).
        retriever: Object whose ``invoke(text)`` returns documents exposing
            ``page_content``.
        llm: Callable returning a completion dict with
            ``["choices"][0]["text"]``.
        query_prefix: Text prepended to the question for retrieval only.
            Pass ``""`` for embedding models that use no query prefix.
        passage_prefix: Prefix stripped from each retrieved document before
            it is placed in the prompt. Pass ``""`` to keep documents as-is.

    Returns:
        The generated answer with surrounding whitespace stripped.
    """
    docs = retriever.invoke(f"{query_prefix}{question}")

    passages = []
    for doc in docs:
        text = doc.page_content
        # Drop the indexing-time marker; it carries no meaning for the LLM.
        if passage_prefix and text.startswith(passage_prefix):
            text = text[len(passage_prefix):]
        passages.append(text)
    context = "\n\n".join(passages)

    prompt = PROMPT_TEMPLATE.format(context=context, question=question)

    # Stop sequences keep the model from inventing a follow-up Q/A turn.
    response = llm(prompt, max_tokens=512, stop=["Question:", "Context:"])
    return response["choices"][0]["text"].strip()
38
+
39
+
40
+ PROMPT_TEMPLATE = """
41
+ You are an AI tutor helping students prepare for the GATE CSE exam. Use the provided textbook-based context to answer the question accurately.
42
+
43
+ Instructions:
44
+ - Base your answer only on the context provided below. If the answer is not in the context, say "The context does not provide enough information to answer this question."
45
+ - For **comparative questions**, respond using a **markdown table** with clearly labeled headers and rows.
46
+ - Be **concise**, **accurate**, and **easy to understand**.
47
+ - Avoid repeating the question or instructions in the answer.
48
+ - Use bullet points or tables if that improves clarity.
49
+
50
+ Context:
51
+ {context}
52
+
53
+ Question:
54
+ {question}
55
+ """
56
+
57
+
58
+
59
def main():
    """Command-line entry point: ask one question and print the answer.

    Loads the FAISS store and the LLM, reads a question from stdin, and
    delegates retrieval, prompt construction and generation to
    ``generate_answer`` instead of repeating that logic here. The CLI
    retrieves the 5 most similar chunks.
    """
    vectorstore = load_vectorstore(FAISS_DB_PATH, EMBEDDING_MODEL_NAME)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    llm = load_llm(LLM_MODEL_PATH)

    question = input("Ask your GATE-CSE related question: ")
    answer = generate_answer(question, retriever, llm)

    print("\n--- Answer ---\n")
    print(answer)
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ langchain
3
+ langchain-community
4
+ langchain-huggingface
5
+ huggingface-hub
6
+ transformers
7
+ torch
8
+ faiss-cpu
9
+ llama-cpp-python
10
+ sentence-transformers
11
+ tqdm
vectorstore/faiss_db/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f1ba3fa56a742fbb651fdbe56f8e57133bb8b9571a47d3dd8943448270a6fec
3
+ size 54082605
vectorstore/faiss_db/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7321e193ed6623f5e46c6b3a41324b721d8972253ec474899dbc92b793bc65c4
3
+ size 23933172