Ralqasimi committed on
Commit
e4a6d2a
·
verified ·
1 Parent(s): 788991c

Update pdf_extractor.py

Browse files
Files changed (1) hide show
  1. pdf_extractor.py +8 -6
pdf_extractor.py CHANGED
@@ -1,13 +1,15 @@
1
- from PyPDF2 import PdfReader
2
 
3
- # Function to extract text from PDFs (normal PDFs only)
4
  def extract_text_from_pdf(pdf_path):
5
- reader = PdfReader(pdf_path)
 
 
6
  text = ""
7
- for page in reader.pages:
8
- text += page.extract_text()
 
 
9
  return text.strip()
10
 
11
- # Main function to handle PDF text extraction
12
  def get_pdf_text(pdf_path):
13
  return extract_text_from_pdf(pdf_path)
 
1
+ import fitz # PyMuPDF
2
 
 
3
  def extract_text_from_pdf(pdf_path):
4
+ """
5
+ Extracts text from PDF using PyMuPDF (fitz).
6
+ """
7
  text = ""
8
+ with fitz.open(pdf_path) as pdf:
9
+ for page_num in range(len(pdf)):
10
+ page = pdf[page_num]
11
+ text += page.get_text()
12
  return text.strip()
13
 
 
14
  def get_pdf_text(pdf_path):
15
  return extract_text_from_pdf(pdf_path)