import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from PyPDF2 import PdfReader
import spacy
# Load the SpaCy English pipeline for sentence segmentation, downloading it on first use
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# Embedding model for retrieval and a locally fine-tuned T5 checkpoint for question generation
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = T5Tokenizer.from_pretrained("./T5base_Question_Generation")
t5_model = T5ForConditionalGeneration.from_pretrained("./T5base_Question_Generation")
def extract_text_from_pdf(pdf_path):
    """Concatenate the extractable text of every page in the PDF."""
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text
def split_into_sentences(text):
    """Split raw text into cleaned, non-empty sentences using SpaCy."""
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]
def create_chunks(sentences, window_size=2):
    """Build overlapping chunks by sliding a window of `window_size` sentences."""
    return [" ".join(sentences[i:i + window_size]) for i in range(len(sentences) - window_size + 1)]
def generate_embeddings(chunks):
    """Encode each chunk into a dense vector with the sentence-transformer model."""
    return embedding_model.encode(chunks, show_progress_bar=True)
def create_faiss_index(embeddings):
    """Store chunk embeddings in a flat (exact-search) L2 FAISS index."""
    dimension = embeddings[0].shape[0]
    index = faiss.IndexFlatL2(dimension)
    # FAISS expects float32 vectors
    index.add(np.array(embeddings, dtype="float32"))
    return index
def retrieve_relevant_chunks(query, chunks, index, top_k=30):
    """Return the top_k chunks closest to the query in embedding space."""
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding, dtype="float32"), top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are indexed
    return [chunks[i] for i in indices[0] if i != -1], distances[0]
def get_questions(tag, difficulty, context, num_questions=3, max_length=150):
    """Generate candidate questions for the given tag, difficulty, and context."""
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99> {context}"
    # Truncate overly long contexts to the tokenizer's maximum input length
    features = tokenizer([input_text], return_tensors='pt', truncation=True)
    output = t5_model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
        num_return_sequences=num_questions,
        do_sample=True,
        top_p=0.95,
        top_k=50,
    )
    return [tokenizer.decode(out, skip_special_tokens=True) for out in output]
def process_pdf(pdf_file, tag, difficulty, query):
    """End-to-end pipeline: PDF -> sentences -> chunks -> retrieval -> question generation."""
    if pdf_file is None:
        return "Please upload a PDF file."
    text = extract_text_from_pdf(pdf_file.name)
    sentences = split_into_sentences(text)
    chunks = create_chunks(sentences)
    if not chunks:
        return "The PDF does not contain enough text to build chunks."
    embeddings = generate_embeddings(chunks)
    index = create_faiss_index(embeddings)
    relevant_chunks, _ = retrieve_relevant_chunks(query, chunks, index)
    # Keep only reasonably long chunks so the generator has enough context
    filtered_chunks = [chunk for chunk in relevant_chunks if len(chunk.split()) > 20][:3]
    if not filtered_chunks:
        return "No sufficiently long chunks found. Try another query."
    context = " ".join(filtered_chunks)
    questions = get_questions(tag, difficulty, context)
    return "\n".join([f"Question {i+1}: {q}" for i, q in enumerate(questions)])
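# Minimal usage sketch (assumptions: the script is run directly, the uploaded-file
# object only needs a `.name` attribute pointing at a PDF path, and "sample.pdf"
# plus the tag, difficulty, and query values below are hypothetical placeholders).
if __name__ == "__main__":
    from types import SimpleNamespace

    uploaded = SimpleNamespace(name="sample.pdf")  # stand-in for a file-upload object
    print(process_pdf(uploaded, tag="biology", difficulty="medium", query="cell structure"))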