Spaces:

Ralqasimi
/

Chatbot

Sleeping

Ralqasimi committed on Feb 7

Commit

e4a6d2a

verified ·

1 Parent(s): 788991c

Update pdf_extractor.py

Files changed (1) hide show

pdf_extractor.py CHANGED Viewed

@@ -1,13 +1,15 @@
-from PyPDF2 import PdfReader
-# Function to extract text from PDFs (normal PDFs only)
 def extract_text_from_pdf(pdf_path):
-    reader = PdfReader(pdf_path)
     text = ""
-    for page in reader.pages:
-        text += page.extract_text()
     return text.strip()
-# Main function to handle PDF text extraction
 def get_pdf_text(pdf_path):
     return extract_text_from_pdf(pdf_path)

+import fitz  # PyMuPDF
 def extract_text_from_pdf(pdf_path):
+    """
+    Extracts text from PDF using PyMuPDF (fitz).
+    """
     text = ""
+    with fitz.open(pdf_path) as pdf:
+        for page_num in range(len(pdf)):
+            page = pdf[page_num]
+            text += page.get_text()
     return text.strip()
 def get_pdf_text(pdf_path):
     return extract_text_from_pdf(pdf_path)