Spaces:
Sleeping
Sleeping
Update rag_pipeline.py
Browse files- rag_pipeline.py +18 -22
rag_pipeline.py
CHANGED
@@ -3,37 +3,33 @@ from langchain_community.document_loaders import PyMuPDFLoader
|
|
3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
from langchain_community.vectorstores import Chroma
|
5 |
from langchain_huggingface import HuggingFaceEmbeddings
|
6 |
-
from transformers import pipeline
|
7 |
|
8 |
-
#
|
9 |
-
def
|
10 |
-
|
11 |
-
pdf_dir.mkdir(exist_ok=True)
|
12 |
docs = []
|
13 |
-
for
|
14 |
-
loader = PyMuPDFLoader(str(
|
15 |
docs.extend(loader.load())
|
16 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
17 |
-
|
|
|
|
|
|
|
18 |
|
19 |
-
|
20 |
-
db = Chroma.from_documents(split_docs, embedding, persist_directory="chroma_db")
|
21 |
-
return db.as_retriever(search_kwargs={"k": 5})
|
22 |
|
23 |
-
#
|
24 |
qa_pipeline = pipeline(
|
25 |
"question-answering",
|
26 |
-
model=
|
27 |
-
tokenizer=
|
28 |
device=-1
|
29 |
)
|
30 |
|
31 |
-
|
32 |
-
retriever = load_and_index()
|
33 |
-
|
34 |
-
# Perform retrieval + QA
|
35 |
-
def answer_question(question: str) -> str:
|
36 |
docs = retriever.get_relevant_documents(question)
|
37 |
-
context = "\n\n".join(
|
38 |
-
|
39 |
-
return
|
|
|
3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
from langchain_community.vectorstores import Chroma
|
5 |
from langchain_huggingface import HuggingFaceEmbeddings
|
6 |
+
from transformers import pipeline
|
7 |
|
8 |
+
# Retriever for top-k relevant document chunks (default k=5)
def init_retriever(data_dir: str = "data", k: int = 5):
    """Build a Chroma retriever over every PDF found in *data_dir*.

    Loads each ``*.pdf`` with PyMuPDF, splits the pages into overlapping
    chunks, embeds them with LaBSE (multilingual, so Arabic documents are
    supported — presumably the reason for this model; confirm), and
    persists the vector store to ``chroma_db``.

    Args:
        data_dir: Directory scanned (non-recursively) for PDF files.
            Created if missing. Defaults to ``"data"``.
        k: Number of chunks the retriever returns per query. Defaults to 5.

    Returns:
        A LangChain retriever configured with ``search_kwargs={"k": k}``.

    Raises:
        ValueError: If *data_dir* contains no loadable PDF content —
            otherwise ``Chroma.from_documents`` fails with an opaque error.
    """
    pdf_dir = Path(data_dir)
    pdf_dir.mkdir(exist_ok=True)

    docs = []
    for pdf in pdf_dir.glob("*.pdf"):
        docs.extend(PyMuPDFLoader(str(pdf)).load())

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(docs)
    if not chunks:
        raise ValueError(
            f"No PDF content found in {pdf_dir!r}; add PDFs before indexing."
        )

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )
    vectordb = Chroma.from_documents(chunks, embeddings, persist_directory="chroma_db")
    return vectordb.as_retriever(search_kwargs={"k": k})
|
20 |
|
21 |
+
# Build the retriever once at import time so all queries share one index.
retriever = init_retriever()

# Arabic extractive question-answering model; the same checkpoint supplies
# both the model weights and the tokenizer. device=-1 forces CPU inference.
_QA_MODEL_ID = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA"
qa_pipeline = pipeline(
    "question-answering",
    model=_QA_MODEL_ID,
    tokenizer=_QA_MODEL_ID,
    device=-1,
)
|
30 |
|
31 |
+
def answer(question: str) -> str:
    """Answer *question* using extractive QA over retrieved PDF chunks.

    Retrieves the top-k relevant chunks, joins them into one context
    string, and runs the QA pipeline over it.

    Args:
        question: The user's question (Arabic expected, given the model).

    Returns:
        The extracted answer span, or an Arabic "no clear answer" message
        when nothing useful is found.
    """
    fallback = "عفواً، لم أجد إجابة واضحة."

    docs = retriever.get_relevant_documents(question)
    # Guard: with no retrieved chunks the context would be empty and the
    # pipeline has nothing to extract from.
    if not docs:
        return fallback

    context = "\n\n".join(d.page_content for d in docs)
    out = qa_pipeline(question=question, context=context)

    # SQuADv2-style models may return an *empty string* for unanswerable
    # questions; the original `out.get("answer", fallback)` only covered a
    # missing key and would hand "" back to the user.
    ans = (out.get("answer") or "").strip()
    return ans or fallback
|