ramysaidagieb committed on
Commit
07f5718
·
verified ·
1 Parent(s): 3dc1a7f

Update rag_pipeline.py

Browse files
Files changed (1) hide show
  1. rag_pipeline.py +18 -22
rag_pipeline.py CHANGED
@@ -3,37 +3,33 @@ from langchain_community.document_loaders import PyMuPDFLoader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import Chroma
5
  from langchain_huggingface import HuggingFaceEmbeddings
6
- from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
7
 
8
def load_and_index():
    """Build a Chroma retriever over every PDF found under ./data.

    Each PDF is loaded page-by-page with PyMuPDF, split into 1000-char
    chunks with 200-char overlap, embedded with LaBSE on CPU, and the
    resulting index is persisted to ./chroma_db. Returns a retriever
    configured for top-5 similarity search.
    """
    corpus_dir = Path("data")
    corpus_dir.mkdir(exist_ok=True)
    pages = []
    for source in corpus_dir.glob("*.pdf"):
        pages.extend(PyMuPDFLoader(str(source)).load())
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunked = chunker.split_documents(pages)
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )
    store = Chroma.from_documents(chunked, embedder, persist_directory="chroma_db")
    return store.as_retriever(search_kwargs={"k": 5})
22
 
23
# Extractive Arabic QA model, pinned to CPU (device=-1).
_QA_MODEL_ID = "alyaa82/aravec-bert-base-qa"
qa_pipeline = pipeline(
    "question-answering",
    model=AutoModelForQuestionAnswering.from_pretrained(_QA_MODEL_ID),
    tokenizer=AutoTokenizer.from_pretrained(_QA_MODEL_ID),
    device=-1,
)
30
 
31
# Build the retriever a single time at import, shared by all queries.
retriever = load_and_index()


def answer_question(question: str) -> str:
    """Retrieve the top matching chunks and extract an answer span.

    The retrieved chunks are joined into one context string that the
    extractive QA pipeline reads the answer out of.
    """
    hits = retriever.get_relevant_documents(question)
    combined = "\n\n".join(hit.page_content for hit in hits)
    return qa_pipeline(question=question, context=combined)["answer"]
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.vectorstores import Chroma
5
  from langchain_huggingface import HuggingFaceEmbeddings
6
+ from transformers import pipeline
7
 
8
def init_retriever():
    """Index every PDF under ./data and return a top-5 Chroma retriever.

    Documents are split into 1000-char chunks (200 overlap), embedded
    with LaBSE on CPU, and the vector store is persisted to ./chroma_db.
    """
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)
    loaded = []
    for pdf_path in data_dir.glob("*.pdf"):
        loaded.extend(PyMuPDFLoader(str(pdf_path)).load())
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pieces = text_splitter.split_documents(loaded)
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/LaBSE",
        model_kwargs={"device": "cpu"},
    )
    index = Chroma.from_documents(pieces, embedder, persist_directory="chroma_db")
    return index.as_retriever(search_kwargs={"k": 5})


# Module-level retriever built once at import time, shared by answer().
retriever = init_retriever()
 
 
22
 
23
# Arabic extractive QA pipeline (AraElectra fine-tuned on Arabic SQuADv2),
# forced onto CPU with device=-1.
_MODEL_NAME = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA"
qa_pipeline = pipeline(
    "question-answering",
    model=_MODEL_NAME,
    tokenizer=_MODEL_NAME,
    device=-1,
)
30
 
31
# Fallback shown when no answer can be produced ("Sorry, I did not find a
# clear answer." in Arabic) — must stay byte-identical for existing callers.
FALLBACK_ANSWER = "عفواً، لم أجد إجابة واضحة."


def answer(question: str) -> str:
    """Answer *question* via retrieval-augmented extractive QA.

    Retrieves the top chunks for the question, joins them into a single
    context, and extracts an answer span with the Arabic QA pipeline.
    Returns the extracted answer, or the Arabic fallback message when
    retrieval yields nothing or the model finds no answer span.
    """
    docs = retriever.get_relevant_documents(question)
    # Guard: the HF question-answering pipeline raises on an empty context,
    # so return the fallback instead of crashing when nothing was retrieved.
    if not docs:
        return FALLBACK_ANSWER
    context = "\n\n".join(d.page_content for d in docs)
    out = qa_pipeline(question=question, context=context)
    # SQuADv2-style models can yield an empty span for "no answer"; map
    # both a missing key and an empty string to the fallback message.
    return out.get("answer") or FALLBACK_ANSWER