Spaces:
Build error
Build error
initial commit
Browse files
- .gitattributes +10 -0
- app.py +51 -0
- create_memory_for_llm.py +63 -0
- data/(CN 1) Computer Network.pdf +3 -0
- data/(CN 2) DATA_COMMUNICATIONS_AND_NETWORKING_McGra.pdf +3 -0
- data/(DSA) Introduction to Algorithms - 3rd Edition.pdf +3 -0
- data/(OS) Operating System Concepts (9th Ed) - Gagne, Silberschatz, and Galvin.pdf +3 -0
- data/CAO Hamacher.pdf +3 -0
- data/DBMS Korth.pdf +3 -0
- data/Discrete Mathematics and Its Applications Seventh Edition by Kenneth H. Rosen.pdf +3 -0
- data/TOC.pdf +3 -0
- models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +3 -0
- rag_qa.py +79 -0
- requirements.txt +11 -0
- vectorstore/faiss_db/index.faiss +3 -0
- vectorstore/faiss_db/index.pkl +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/(CN[[:space:]]1)[[:space:]][[:space:]]Computer[[:space:]]Network.pdf filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/(CN[[:space:]]2)[[:space:]]DATA_COMMUNICATIONS_AND_NETWORKING_McGra.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
+
data/(DSA)[[:space:]]Introduction[[:space:]]to[[:space:]]Algorithms[[:space:]]-[[:space:]]3rd[[:space:]]Edition.pdf filter=lfs diff=lfs merge=lfs -text
|
39 |
+
data/(OS)[[:space:]]Operating[[:space:]]System[[:space:]]Concepts[[:space:]](9th[[:space:]]Ed)[[:space:]]-[[:space:]]Gagne,[[:space:]]Silberschatz,[[:space:]]and[[:space:]]Galvin.pdf filter=lfs diff=lfs merge=lfs -text
|
40 |
+
data/CAO[[:space:]]Hamacher.pdf filter=lfs diff=lfs merge=lfs -text
|
41 |
+
data/DBMS[[:space:]]Korth.pdf filter=lfs diff=lfs merge=lfs -text
|
42 |
+
data/Discrete[[:space:]]Mathematics[[:space:]]and[[:space:]]Its[[:space:]]Applications[[:space:]]Seventh[[:space:]]Edition[[:space:]]by[[:space:]]Kenneth[[:space:]]H.[[:space:]]Rosen.pdf filter=lfs diff=lfs merge=lfs -text
|
43 |
+
data/TOC.pdf filter=lfs diff=lfs merge=lfs -text
|
44 |
+
models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
|
45 |
+
vectorstore/faiss_db/index.faiss filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from rag_qa import load_vectorstore, load_llm, generate_answer
|
3 |
+
|
4 |
+
# Paths to the FAISS index directory and the GGUF chat model, and the name of
# the embedding model handed to load_vectorstore.
FAISS_DB_PATH = "vectorstore/faiss_db"
LLM_MODEL_PATH = "models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
EMBEDDING_MODEL_NAME = "intfloat/e5-small"


@st.cache_resource(show_spinner=False)
def get_retriever_and_llm():
    """Create the retriever and the LLM used to answer questions.

    Wrapped in ``st.cache_resource`` (with its spinner disabled) so Streamlit
    can reuse the returned objects instead of rebuilding them on every
    script rerun.

    Returns:
        A ``(retriever, llm)`` pair: a top-3 retriever over the FAISS index
        at ``FAISS_DB_PATH`` and the model loaded from ``LLM_MODEL_PATH``.
    """
    # The vector store is loaded before the LLM, same as the call order the
    # rest of the app has always used.
    store = load_vectorstore(FAISS_DB_PATH, EMBEDDING_MODEL_NAME)
    top3_retriever = store.as_retriever(search_kwargs={"k": 3})
    return top3_retriever, load_llm(LLM_MODEL_PATH)
|
14 |
+
|
15 |
+
def main():
    """Render the GATE CSE Assistant page and answer the submitted question.

    Flow of one script run:
      1. Configure the page and draw the centered title and subtitle.
      2. Make sure ``st.session_state.chat_history`` exists.
      3. Load the cached retriever and LLM.
      4. If "Get Answer" was pressed with a non-blank question, generate the
         answer, record it in the chat history and display it.
      5. Draw the sidebar history (newest exchange first) and the
         "Clear Chat" button.

    The sidebar is drawn last on purpose: it is a separate container, so its
    on-screen position does not change, but it now reflects an answer
    produced in this same run instead of lagging one interaction behind.
    """
    st.set_page_config(page_title="GATE CSE Assistant", page_icon="🎓")
    st.markdown("<h1 style='text-align: center;'>🎓 GATE CSE Assistant</h1>", unsafe_allow_html=True)
    st.markdown("<p style='text-align: center;'>Ask any technical question related to GATE Computer Science syllabus.</p>", unsafe_allow_html=True)

    # Session state for chat history
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Load retriever and LLM
    retriever, llm = get_retriever_and_llm()

    # Main interaction
    question = st.text_input("💬 Type your question:", placeholder="e.g. Explain paging in OS.")
    if st.button("Get Answer") and question.strip():
        with st.spinner("Thinking..."):
            answer = generate_answer(question, retriever, llm)
        st.session_state.chat_history.append({"question": question, "answer": answer})
        st.markdown("### ✅ Answer")
        st.success(answer)

    # Sidebar Chat History
    with st.sidebar:
        st.title("🧾 Chat History")
        if st.session_state.chat_history:
            # Newest exchange first; numbering restarts at 1 from the top.
            for i, chat in enumerate(reversed(st.session_state.chat_history), 1):
                st.markdown(f"**Q{i}:** {chat['question']}")
                st.markdown(f"**A{i}:** {chat['answer']}")
        else:
            st.info("No questions asked yet.")

        # Clear history button
        if st.button("🗑️ Clear Chat"):
            st.session_state.chat_history = []
            # The history above was already drawn during this run, so rerun
            # immediately; otherwise the cleared entries stay on screen until
            # the next interaction.
            st.rerun()


if __name__ == "__main__":
    main()
|
create_memory_for_llm.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from langchain_community.vectorstores import FAISS
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
import os
|
6 |
+
# Step 1: load raw PDF(s)
# Pipeline settings: input directory, output index directory, embedding model
# name, and the splitter's chunk size / overlap.
DATA_PATH = "data/"
FAISS_DB_PATH = "vectorstore/faiss_db"
EMBEDDING_MODEL_NAME = "intfloat/e5-small"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50


def load_pdf_files(data_path):
    """Load every ``*.pdf`` directly under ``data_path``.

    Args:
        data_path: Directory to scan for PDF files.

    Returns:
        The list of documents produced by ``DirectoryLoader`` using
        ``PyMuPDFLoader`` (the log line counts them as pages).
    """
    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyMuPDFLoader)
    pages = pdf_loader.load()
    print(f"[INFO] Loaded {len(pages)} Pages from PDF Files.")
    return pages
|
22 |
+
|
23 |
+
def create_chunks(documents, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Split loaded documents into overlapping text chunks.

    Args:
        documents: Documents to split.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``; defaults to ``CHUNK_SIZE``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``; defaults
            to ``CHUNK_OVERLAP``.

    Returns:
        The chunks returned by the splitter's ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = text_splitter.split_documents(documents=documents)
    print(f"[INFO] Created {len(pieces)} Chunks.")
    return pieces
|
32 |
+
|
33 |
+
def format_e5_chunks(chunks):
    """Tag each chunk's text with the ``"passage: "`` prefix, in place.

    The chunks are modified in place and the same container is returned.
    The operation is idempotent: a chunk whose text already starts with
    ``"passage: "`` is left unchanged, so running the function twice on the
    same objects does not produce ``"passage: passage: ..."``.

    Args:
        chunks: Iterable of objects exposing a writable ``page_content``
            string attribute.

    Returns:
        The ``chunks`` argument itself, after prefixing.
    """
    prefix = "passage: "
    for chunk in chunks:
        # NOTE: text that genuinely begins with "passage: " is treated as
        # already tagged and is not prefixed again.
        if not chunk.page_content.startswith(prefix):
            chunk.page_content = prefix + chunk.page_content
    return chunks
|
38 |
+
|
39 |
+
def get_embedding_model(model_name):
    """Build a ``HuggingFaceEmbeddings`` instance configured for the CPU.

    Args:
        model_name: Name of the embedding model to load.

    Returns:
        The embeddings object, created with ``model_kwargs={"device": "cpu"}``.
    """
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": "cpu"},
    )
|
42 |
+
|
43 |
+
def store_in_faiss(chunks, embedding_model, persist_path):
    """Embed ``chunks`` into a FAISS index and save it under ``persist_path``.

    Args:
        chunks: Non-empty sequence of documents to index.
        embedding_model: Embedding object passed to ``FAISS.from_documents``.
        persist_path: Directory the index is written to; created if missing.

    Raises:
        ValueError: If ``chunks`` is empty. Raised before any directory is
            created, so a failed run leaves nothing behind.
    """
    if not chunks:
        raise ValueError(
            "No chunks to index; check that the data directory contains readable PDFs."
        )

    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory.
    os.makedirs(persist_path, exist_ok=True)

    db = FAISS.from_documents(documents=chunks, embedding=embedding_model)
    db.save_local(persist_path)
    print(f"[INFO] FAISS Vector DB saved at: {persist_path}")
|
53 |
+
|
54 |
+
|
55 |
+
def main():
    """Run the ingestion pipeline end to end.

    Loads the PDFs under ``DATA_PATH``, splits them into chunks, applies the
    E5 passage formatting, builds the embedding model named by
    ``EMBEDDING_MODEL_NAME`` and stores the result at ``FAISS_DB_PATH``.
    """
    # Stages run in the same order as before: load, split, format, then
    # build the embedding model, then store.
    prepared = format_e5_chunks(create_chunks(documents=load_pdf_files(DATA_PATH)))
    store_in_faiss(prepared, get_embedding_model(EMBEDDING_MODEL_NAME), FAISS_DB_PATH)


if __name__ == "__main__":
    main()
|
data/(CN 1) Computer Network.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d71e4f9a669ec8dc8164f0d7742022c75278743ee54fb14c16ead300852fbeb
|
3 |
+
size 6169180
|
data/(CN 2) DATA_COMMUNICATIONS_AND_NETWORKING_McGra.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e7b3184a02a0f88c7769e9e1d22ecba242315ce41ce6d7daf1fd06b51dc728a7
|
3 |
+
size 11611941
|
data/(DSA) Introduction to Algorithms - 3rd Edition.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:780800adc787535f1aba5083d71cf42384fbb80d40a4e2aea68e827092a6e32e
|
3 |
+
size 5076764
|
data/(OS) Operating System Concepts (9th Ed) - Gagne, Silberschatz, and Galvin.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:32d9d9edd7f373464c74e810f9c9dd5d94d770e479ee469213c5fe3da8a0f6b1
|
3 |
+
size 5477883
|
data/CAO Hamacher.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59290ee2a90e714bfb1920fc680c1630c3e9a05c36b75fc8f218aa7e9053a193
|
3 |
+
size 3149031
|
data/DBMS Korth.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4caafc657ce455e63e6f3a7b2992f81c49a668ca2ec7d83d1163b85d9c8da910
|
3 |
+
size 17297558
|
data/Discrete Mathematics and Its Applications Seventh Edition by Kenneth H. Rosen.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c962d80120044e86cbadcf01211e0cd1997be2fbe6c48837caee5387662528ce
|
3 |
+
size 21332313
|
data/TOC.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1773452ebf526f59da83abcf41e5b3f2de1ddc8504b7d11297a9f3a610656a41
|
3 |
+
size 21979810
|
models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9fecc3b3cd76bba89d504f29b616eedf7da85b96540e490ca5824d3f7d2776a0
|
3 |
+
size 668788096
|
rag_qa.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.vectorstores import FAISS
|
2 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
3 |
+
from langchain.prompts import PromptTemplate
|
4 |
+
from langchain.chains import RetrievalQA
|
5 |
+
from llama_cpp import Llama
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Defaults used by main() below: FAISS index directory, embedding model name
# and GGUF model file.
FAISS_DB_PATH = "vectorstore/faiss_db"
EMBEDDING_MODEL_NAME = "intfloat/e5-small"
LLM_MODEL_PATH = "models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"


def load_vectorstore(path, embedding_model_name):
    """Load a saved FAISS vector store from ``path``.

    Args:
        path: Directory containing the saved index.
        embedding_model_name: Name of the embedding model used for queries;
            it is loaded on the CPU.

    Returns:
        The vector store returned by ``FAISS.load_local``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        model_kwargs={"device": "cpu"},
    )
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # the pickled part of the index. Only point this at index files you
    # created yourself or otherwise trust.
    return FAISS.load_local(path, embedder, allow_dangerous_deserialization=True)
|
20 |
+
|
21 |
+
def load_llm(model_path, n_ctx=2048, n_batch=64, n_threads=4):
    """Load a GGUF model through ``llama_cpp.Llama`` with logging silenced.

    The tuning values used to be hard-coded; they are now keyword parameters
    with the same defaults, so existing ``load_llm(path)`` calls behave
    exactly as before while other hosts can adjust them.

    Args:
        model_path: Path to the model file.
        n_ctx: Forwarded to ``Llama`` as ``n_ctx`` (default 2048).
        n_batch: Forwarded to ``Llama`` as ``n_batch`` (default 64).
        n_threads: Forwarded to ``Llama`` as ``n_threads`` (default 4).

    Returns:
        The constructed ``Llama`` instance (``verbose=False``).
    """
    return Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_batch=n_batch,
        n_threads=n_threads,
        verbose=False,
    )
|
29 |
+
|
30 |
+
def generate_answer(question, retriever, llm):
    """Answer ``question`` from retrieved context using ``llm``.

    The embedding model configured in this file is an E5 model, and the
    ingestion script stores every chunk as ``"passage: <text>"``. The E5
    convention pairs that with ``"query: <text>"`` on the search side, so the
    retriever is invoked with the prefixed question. The ``"passage: "``
    marker is an embedding-side tag only and is stripped from the retrieved
    text before it is placed in the prompt.

    Args:
        question: The user's question, inserted into the prompt unchanged.
        retriever: Object whose ``invoke(text)`` returns documents exposing
            ``page_content``.
        llm: Callable accepting ``(prompt, max_tokens=..., stop=...)`` and
            returning a dict shaped like
            ``{"choices": [{"text": ...}]}``.

    Returns:
        The first completion's text with surrounding whitespace removed.
    """
    docs = retriever.invoke(f"query: {question}")

    passage_prefix = "passage: "
    texts = [
        doc.page_content[len(passage_prefix):]
        if doc.page_content.startswith(passage_prefix)
        else doc.page_content
        for doc in docs
    ]
    context = "\n\n".join(texts)

    prompt = PROMPT_TEMPLATE.format(context=context, question=question)

    # Stop as soon as the model starts echoing a new "Question:" or
    # "Context:" section of the template.
    response = llm(prompt, max_tokens=512, stop=["Question:", "Context:"])
    return response["choices"][0]["text"].strip()


# Prompt sent to the LLM; {context} and {question} are filled by
# generate_answer.
PROMPT_TEMPLATE = """
You are an AI tutor helping students prepare for the GATE CSE exam. Use the provided textbook-based context to answer the question accurately.

Instructions:
- Base your answer only on the context provided below. If the answer is not in the context, say "The context does not provide enough information to answer this question."
- For **comparative questions**, respond using a **markdown table** with clearly labeled headers and rows.
- Be **concise**, **accurate**, and **easy to understand**.
- Avoid repeating the question or instructions in the answer.
- Use bullet points or tables if that improves clarity.

Context:
{context}

Question:
{question}
"""
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
def main():
    """Command-line entry point: ask one question and print the answer.

    Loads the vector store and LLM from the module-level defaults, builds a
    top-5 retriever, reads a single question from standard input and prints
    the generated answer under an ``--- Answer ---`` header.
    """
    vectorstore = load_vectorstore(FAISS_DB_PATH, EMBEDDING_MODEL_NAME)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    llm = load_llm(LLM_MODEL_PATH)

    question = input("Ask your GATE-CSE related question: ")

    # Reuse generate_answer rather than repeating the retrieval / prompt /
    # LLM-call steps here, so the CLI and any other caller of
    # generate_answer cannot drift apart.
    answer = generate_answer(question, retriever, llm)

    print("\n--- Answer ---\n")
    print(answer)


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
langchain
|
3 |
+
langchain-community
|
4 |
+
langchain-huggingface
|
5 |
+
huggingface-hub
|
6 |
+
transformers
|
7 |
+
torch
|
8 |
+
faiss-cpu
|
9 |
+
llama-cpp-python
|
10 |
+
sentence-transformers
|
11 |
+
tqdm
|
vectorstore/faiss_db/index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f1ba3fa56a742fbb651fdbe56f8e57133bb8b9571a47d3dd8943448270a6fec
|
3 |
+
size 54082605
|
vectorstore/faiss_db/index.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7321e193ed6623f5e46c6b3a41324b721d8972253ec474899dbc92b793bc65c4
|
3 |
+
size 23933172
|