# pdf2txt_parser_converter / PDF Parser - Sevenof9_v7d.py
# Author: kalle07
# Commit note: hopefully more stable and more comfortable
# Commit: 8958f01 (verified)
import os
import sys
import time
import json
import wx
import re
import platform
import subprocess
import threading
import concurrent.futures
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
import pdfplumber
import psutil
import logging
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdfdocument import PDFDocument, PDFEncryptionError, PDFPasswordIncorrect
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
# -------------------- Configuration --------------------
# Page-count cut-off: callers in this file process PDFs with more pages than
# this page-by-page in parallel, and PDFs at or below it as one unit.
PARALLEL_THRESHOLD = 14
# Keyword arguments passed to the page's extract_words() call in
# process_page_worker.
TEXT_EXTRACTION_SETTINGS = {
    "x_tolerance": 1.5,
    "y_tolerance": 2.5,
    "keep_blank_chars": False,
    "use_text_flow": False,
}
# GUI update interval
def throttle_callback(callback, interval_ms=1):
    """Wrap *callback* so it fires at most once every *interval_ms* milliseconds.

    Calls that arrive before the interval has elapsed are dropped, not
    queued. The first call always goes through.

    Args:
        callback: One-argument callable that receives the status.
        interval_ms: Minimum spacing between forwarded calls, in milliseconds.

    Returns:
        A one-argument wrapper around *callback*.
    """
    # -inf guarantees the very first call passes whatever the clock reads.
    last_called = float("-inf")

    def wrapper(status):
        nonlocal last_called
        # Monotonic clock: a wall-clock adjustment (NTP, DST, manual change)
        # must not be able to suppress or burst the updates.
        now = time.monotonic() * 1000  # milliseconds
        if now - last_called >= interval_ms:
            last_called = now
            callback(status)

    return wrapper
# Silence pdfminer's chatty loggers
def suppress_pdfminer_logging():
    """Raise the level of the pdfminer loggers to ERROR.

    Everything below ERROR (debug, info, warning) is filtered out for the
    package logger and each listed sub-module logger.
    """
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        target = logging.getLogger(name)
        target.setLevel(logging.ERROR)
# Matches any character outside the range U+0000..U+FFFF.
EUROPEAN_PRINTABLES_PATTERN = re.compile(r"[^\u0000-\uFFFF]", re.DOTALL)
# Matches "(cid:<digits>)" placeholders.
CID_PATTERN = re.compile(r"\(cid:\d+\)")


def clean_cell_text(text):
    """Normalise the text of one table cell.

    Non-string input yields "". Otherwise hyphenated line breaks ("-\\n") are
    removed, remaining newlines become spaces, "(cid:N)" placeholders are
    removed, and characters matched by EUROPEAN_PRINTABLES_PATTERN are
    stripped.
    """
    if isinstance(text, str):
        single_line = text.replace("-\n", "").replace("\n", " ")
        without_cids = CID_PATTERN.sub("", single_line)
        return EUROPEAN_PRINTABLES_PATTERN.sub("", without_cids)
    return ""
def clamp_bbox(bbox, page_width, page_height, p=3):
    """Clamp a bounding box to the page and round its coordinates.

    Args:
        bbox: Four values in the order (x0, top, x1, bottom).
        page_width: Upper limit for the two x values.
        page_height: Upper limit for the two y values.
        p: Number of decimal places passed to round().

    Returns:
        Tuple (x0, top, x1, bottom), each limited to [0, page size].
    """
    left, upper, right, lower = bbox

    def _clip(value, limit):
        return max(0, min(value, limit))

    return (
        round(_clip(left, page_width), p),
        round(_clip(upper, page_height), p),
        round(_clip(right, page_width), p),
        round(_clip(lower, page_height), p),
    )
def get_physical_cores():
    """Return the physical core count reported by psutil, never less than 1."""
    detected = psutil.cpu_count(logical=False)
    if not detected:
        # psutil returned None or 0: fall back to a single core.
        return 1
    return max(1, detected)


# Core count evaluated once when the module is loaded.
cores = get_physical_cores()
def is_valid_cell(cell):
    """Return True if the cell holds more than whitespace or a single character."""
    return cell is not None and len(str(cell).strip()) > 1
def block_area(block):
    """Return the area of the bounding rectangle enclosing all words of *block*.

    Each word is a mapping with the keys "x0", "x1", "top" and "bottom".
    """
    left = min(word["x0"] for word in block)
    right = max(word["x1"] for word in block)
    upper = min(word["top"] for word in block)
    lower = max(word["bottom"] for word in block)
    width = right - left
    height = lower - upper
    return width * height
# Applied at module load so the pdfminer loggers are quiet from the start.
suppress_pdfminer_logging()
# -------------------- Status tracking --------------------
class StatusTracker:
    """Counts processed pages and derives throughput and a time estimate.

    Attributes:
        start_time: time.time() value captured at construction.
        total_pages: Number of pages expected overall.
        processed_pages: Number of pages reported via update() so far.
    """

    def __init__(self, total_pages):
        self.start_time = time.time()
        self.total_pages = total_pages
        self.processed_pages = 0

    def update(self, n=1):
        """Record *n* additional processed pages."""
        self.processed_pages += n

    def get_status(self):
        """Return a snapshot of the progress as a dict.

        Keys: "processed_pages", "total_pages", "pages_per_sec" (rounded to
        an integer), "elapsed_time" and "est_time" (both in minutes, one
        decimal). "est_time" is infinity while no page has been processed.
        """
        elapsed = time.time() - self.start_time
        rate = self.processed_pages / elapsed if elapsed > 0 else 0.0
        # Never report a negative remainder if more pages than expected arrive.
        remaining_pages = max(0, self.total_pages - self.processed_pages)
        # Use the unrounded rate: estimating from the rounded integer turned
        # every rate below 0.5 pages/s into an infinite estimate.
        est_time = (remaining_pages / rate) / 60 if rate > 0 else float("inf")
        return {
            "processed_pages": self.processed_pages,
            "total_pages": self.total_pages,
            "pages_per_sec": round(rate),
            "elapsed_time": round(elapsed / 60, 1),
            "est_time": round(est_time, 1),
        }
# -------------------- PDF processing --------------------
# Distance thresholds (in page units) below which two words belong to one block.
_MAX_DIST_X = 9
_MAX_DIST_Y = 10
# Grid used to round block origins before sorting, e.g. 1 point.
_SORT_TOLERANCE = 1


def _is_bold(fontname):
    """Return True if the font name contains "bold", "bd" or "black"."""
    lowered = fontname.lower()
    return "bold" in lowered or "bd" in lowered or "black" in lowered


def _table_to_json(raw_table):
    """Convert one extracted table into a JSON string, or None if rejected.

    Rejected are tables with fewer than two rows, tables in which no row has
    two or more columns, tables whose cells are all empty or single-character,
    and tables with an empty first row.
    """
    if not raw_table or len(raw_table) < 2:
        return None  # fewer than 2 rows
    if all(len(row) < 2 for row in raw_table if row):
        return None  # no row has at least 2 columns
    if all(all(not is_valid_cell(cell) for cell in row) for row in raw_table):
        return None  # empty or nearly empty table

    cleaned_table = [[clean_cell_text(c) for c in row] for row in raw_table]
    header_row = cleaned_table[0]
    if not header_row:
        # An empty first row used to raise IndexError and fail the whole page.
        return None

    if header_row[0].strip() == "":
        # Empty top-left corner: first row holds column headers, first column
        # holds row headers. Repeated row headers overwrite earlier rows.
        col_headers = header_row[1:]
        table_data = {}
        for row in cleaned_table[1:]:
            if not row:
                continue  # an empty row has no row header to key on
            table_data[row[0]] = dict(zip(col_headers, row[1:]))
    else:
        # Plain header row: only rows with a matching cell count are kept.
        table_data = [
            dict(zip(header_row, row))
            for row in cleaned_table[1:]
            if len(row) == len(header_row)
        ]
    return json.dumps(table_data, indent=1, ensure_ascii=False)


def _extract_text_words(cropped_page, table_bboxes):
    """Return the words of the page that are outside the given table boxes.

    A word is assigned to a table by its top-left corner. Words containing a
    character matched by EUROPEAN_PRINTABLES_PATTERN are skipped.
    """
    words = []
    for w in cropped_page.extract_words(**TEXT_EXTRACTION_SETTINGS):
        x0, top = float(w["x0"]), float(w["top"])
        if any(bx0 <= x0 <= bx1 and by0 <= top <= by1 for bx0, by0, bx1, by1 in table_bboxes):
            continue
        if EUROPEAN_PRINTABLES_PATTERN.search(w["text"]):
            continue
        words.append(w)
    return words


def _describe_words(words, page_chars):
    """Attach font information to each word.

    Returns (word_info, avg_fontsize). A word's font size is the largest size
    among its non-blank characters (0 when none is found); the average runs
    over all words, including those with size 0.
    """
    word_info = []
    font_sizes = []
    for w in words:
        x0 = float(w["x0"])
        x1 = float(w["x1"])
        top = float(w["top"])
        bottom = float(w["bottom"])
        # Characters whose top-left corner lies inside the word's box.
        chars = [
            c for c in page_chars
            if x0 <= float(c["x0"]) <= x1 and top <= float(c["top"]) <= bottom
        ]
        sizes = [float(c.get("size", 0)) for c in chars if c.get("text", "").strip()]
        font_size = max(sizes) if sizes else 0
        font_sizes.append(font_size)
        word_info.append({
            "text": w["text"],
            "top": round(top, 1),
            "bottom": round(bottom, 1),
            "font_size": font_size,
            "font_name": chars[0].get("fontname", "") if chars else "Unknown",
            "bold_flag": any(_is_bold(c.get("fontname", "")) for c in chars),
            "x0": round(x0, 1),
            "x1": round(x1, 1),
        })
    avg_fontsize = sum(font_sizes) / len(font_sizes) if font_sizes else 0
    return word_info, avg_fontsize


def _group_into_blocks(words):
    """Group words into blocks of transitively neighbouring words.

    Two words are neighbours when the gap between their boxes is at most
    _MAX_DIST_X horizontally and _MAX_DIST_Y vertically.
    """
    def are_words_close(w1, w2):
        dx = max(0, max(w1["x0"], w2["x0"]) - min(w1["x1"], w2["x1"]))
        dy = max(0, max(w1["top"], w2["top"]) - min(w1["bottom"], w2["bottom"]))
        return dx <= _MAX_DIST_X and dy <= _MAX_DIST_Y

    blocks = []
    unvisited = set(range(len(words)))
    while unvisited:
        idx = unvisited.pop()
        block = {idx}
        to_visit = {idx}
        while to_visit:
            current = to_visit.pop()
            # Iterate over a copy because matches are removed from the set.
            for other in list(unvisited):
                if are_words_close(words[current], words[other]):
                    block.add(other)
                    to_visit.add(other)
                    unvisited.remove(other)
        blocks.append([words[i] for i in block])
    return blocks


def _group_block_into_lines(block, line_tolerance=2.5):
    """Split a block into lines by "top" coordinate; each line is sorted by x0.

    A word joins the current line while its top differs from the top of the
    line's first word by at most *line_tolerance*.
    """
    sorted_words = sorted(block, key=lambda w: w["top"])
    lines = []
    current_line = [sorted_words[0]]
    current_top = sorted_words[0]["top"]
    for word in sorted_words[1:]:
        if abs(word["top"] - current_top) <= line_tolerance:
            current_line.append(word)
        else:
            lines.append(sorted(current_line, key=lambda w: w["x0"]))
            current_line = [word]
            current_top = word["top"]
    lines.append(sorted(current_line, key=lambda w: w["x0"]))
    return lines


def _block_sort_key(block):
    """Sort key for blocks: rounded minimum x0 first, then rounded minimum top."""
    min_x0 = min(w["x0"] for w in block)
    min_top = min(w["top"] for w in block)
    return (
        round(min_x0 / _SORT_TOLERANCE) * _SORT_TOLERANCE,
        round(min_top / _SORT_TOLERANCE) * _SORT_TOLERANCE,
    )


def _classify_block(block, avg_fontsize):
    """Return "CHAPTER", "IMPORTANT" or None for a block of words."""
    chapter_hits = 0
    important_hits = 0
    for w in block:
        text = w["text"]
        # Rule 1: ignore words of 5 characters or fewer and pure numbers.
        if len(text) <= 5 or text.isdigit():
            continue
        size_ratio = w["font_size"] / avg_fontsize if avg_fontsize else 0
        if size_ratio >= 1.15:
            # Rule 2 (takes precedence): clearly larger than the page average.
            chapter_hits += 1
        elif w["bold_flag"] and size_ratio >= 1:
            # Rule 3: bold and at least average size.
            important_hits += 1

    # Rule 4: more than one hit of any kind is IMPORTANT; a single hit is
    # labelled by its kind.
    if chapter_hits + important_hits > 1:
        return "IMPORTANT"
    if chapter_hits == 1:
        return "CHAPTER"
    if important_hits == 1:
        return "IMPORTANT"
    return None


def process_page_worker(args):
    """Extract the text and tables of one PDF page.

    The page is cropped by 4 % on every side. Tables that pass the checks in
    _table_to_json are emitted as JSON; all remaining words are grouped into
    blocks, ordered by (x0, top), split into lines and written as text. The
    first line of a block may carry a "[CHAPTER]" or "[IMPORTANT]" prefix.

    Words are excluded from the text only when they fall inside a table that
    is actually emitted, so a rejected table no longer hides its content.

    Args:
        args: Tuple (page_number, path) with a zero-based page index.

    Returns:
        Tuple (page_number, text). On any exception the text is an
        "[ERROR] ..." line instead; the exception is not propagated.
    """
    suppress_pdfminer_logging()
    try:
        page_number, path = args
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height
            margin_x, margin_y = width * 0.04, height * 0.04
            cropped_page = page.crop((margin_x, margin_y, width - margin_x, height - margin_y))

            # Detect tables once and keep box and content paired, so only
            # emitted tables mask words in the text pass below.
            tables_json = []
            table_bboxes = []
            for table in cropped_page.find_tables():
                table_json = _table_to_json(table.extract(x_tolerance=1.5, y_tolerance=3))
                if table_json is None:
                    continue
                tables_json.append(table_json)
                table_bboxes.append(clamp_bbox(table.bbox, width, height))

            words = _extract_text_words(cropped_page, table_bboxes)
            word_info, avg_fontsize = _describe_words(words, page.chars)
            sorted_blocks = sorted(_group_into_blocks(word_info), key=_block_sort_key)

            # Page number header
            output_lines = [
                f"\nPage {page_number + 1}, Seite {page_number + 1}, Página {page_number + 1}\n"
            ]
            for block in sorted_blocks:
                block_label = _classify_block(block, avg_fontsize)
                output_lines.append("")  # blank line before each block
                for line_idx, line in enumerate(_group_block_into_lines(block)):
                    line_text = " ".join(w["text"] for w in line)
                    if line_idx == 0 and block_label:
                        line_text = f"[{block_label}] {line_text}"
                    output_lines.append(line_text)

            # Append the tables after the running text.
            for idx, tbl in enumerate(tables_json, 1):
                output_lines.append(f'"table {idx}":\n{tbl}')
            return page_number, "\n".join(output_lines)
    except Exception as e:
        # Deliberate best effort: a failing page becomes an error line so the
        # remaining pages of the document are still converted.
        msg = str(e).strip() or f"{type(e).__name__} (no message)"
        return args[0], f"[ERROR] Seite {args[0]+1}: {msg}"
def run_serial(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process the pages of one PDF one after another in the calling process.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (indices 0..page_number-1).
        tracker: Optional object with update(); called once per page.
        progress_callback: Optional callable; when given together with a
            tracker, report_status() is invoked after each page.
        stop_flag: Optional event-like object; when set, processing stops
            before the next page.

    Returns:
        List of process_page_worker results for the pages handled so far.
    """
    collected = []
    should_report = progress_callback and tracker is not None
    for page_index in range(page_number):
        if stop_flag and stop_flag.is_set():
            break
        collected.append(process_page_worker((page_index, path)))
        if tracker is not None:
            tracker.update()
        if should_report:
            report_status(tracker, progress_callback)
    return collected
def run_parallel(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process the pages of one PDF in parallel worker processes.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (indices 0..page_number-1).
        tracker: Optional object with update(); called once per finished page.
        progress_callback: Optional callable; when given together with a
            tracker, report_status() is invoked after each finished page.
        stop_flag: Accepted for signature parity with run_serial; it is not
            checked here, so a started document always runs to completion.

    Returns:
        List of (page_index, text) results in page order; empty when there is
        nothing to process.
    """
    if page_number < 1:
        # ProcessPoolExecutor rejects max_workers=0, so bail out early.
        return []

    results = [None] * page_number
    workers = min(page_number, get_physical_cores())
    with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(process_page_worker, (page_index, path))
            for page_index in range(page_number)
        ]
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result is None:
                continue
            page, _ = result
            # Pages finish out of order; store each result at its page index.
            results[page] = result
            if tracker is not None:
                tracker.update()
                if progress_callback:
                    report_status(tracker, progress_callback)
    return [r for r in results if r]
def report_status(tracker, progress_callback=None):
    """Publish the tracker's current status.

    Args:
        tracker: Object whose get_status() returns a dict with the keys
            "processed_pages", "total_pages", "pages_per_sec",
            "elapsed_time" and "est_time".
        progress_callback: Optional callable receiving the status dict. When
            omitted, a one-line summary is printed instead.
    """
    status = tracker.get_status()
    if progress_callback:
        progress_callback(status)
    else:
        # elapsed_time and est_time are minute values; the old message called
        # them "Sek." and ran the two fields together.
        print(f"[STATUS] {status['processed_pages']}/{status['total_pages']} Seiten "
              f"({status['pages_per_sec']} Seiten/s, "
              f"Elapsed: {status['elapsed_time']} min, "
              f"Est Time: {status['est_time']} min)")
def save_pdf(path, page_number, tracker=None, parallel=False, progress_callback=None, stop_flag=None):
    """Convert one PDF and write the text next to it as "<name>.txt".

    Args:
        path: Path of the PDF file.
        page_number: Number of pages in the PDF.
        tracker: Optional progress tracker handed to the page runner.
        parallel: Use run_parallel when True, run_serial otherwise.
        progress_callback: Optional status callback handed to the page runner.
        stop_flag: Optional event-like object; when already set, nothing is
            processed or written.

    Returns:
        0 when the stop flag was set on entry, otherwise *page_number*.
    """
    if stop_flag and stop_flag.is_set():
        return 0

    runner = run_parallel if parallel else run_serial
    page_results = runner(path, page_number, tracker, progress_callback, stop_flag)

    # Drop empty entries (pages skipped after a stop), then restore page order.
    ordered = sorted((item for item in page_results if item), key=lambda item: item[0])
    text_output = "\n".join(page_text for _, page_text in ordered)

    stem, _extension = os.path.splitext(path)
    out_path = stem + ".txt"
    with open(out_path, "w", encoding="utf-8", errors="ignore") as out_file:
        out_file.write(text_output)
    return page_number
def _process_single_pdf(path):
    """Open one PDF with pdfminer and count its pages.

    Returns:
        Tuple (path, page_count, None) on success, or (path, 0, message) with
        an "[ERROR] ..." message when the file is encrypted, has invalid
        syntax, forbids text extraction, or fails for any other reason.
    """
    suppress_pdfminer_logging()
    try:
        with open(path, "rb") as handle:
            document = PDFDocument(PDFParser(handle))
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed("Text-Extraktion nicht erlaubt")
            page_count = len(list(PDFPage.create_pages(document)))
            return (path, page_count, None)
    except Exception as e:
        # Checked in the same order as the original handler chain.
        if isinstance(e, (PDFEncryptionError, PDFPasswordIncorrect)):
            message = f"[ERROR] Datei passwortgeschützt: {path} ({type(e).__name__}: {e})\n"
        elif isinstance(e, PDFSyntaxError):
            message = f"[ERROR] Ungültige PDF-Syntax: {path} ({type(e).__name__}: {e})\n"
        elif isinstance(e, PDFTextExtractionNotAllowed):
            message = f"[ERROR] Text-Extraktion nicht erlaubt: {path} ({type(e).__name__}: {e})\n"
        else:
            message = f"[ERROR] Fehler bei Datei {path}: {type(e).__name__}: {e}\n"
        return (path, 0, message)
def get_total_pages(pdf_files, error_callback=None, progress_callback=None):
    """Count the pages of every PDF in *pdf_files*.

    More than 14 files are inspected in a process pool, fewer in the calling
    process.

    Args:
        pdf_files: Sequence of PDF paths.
        error_callback: Optional callable receiving each error message;
            without it, messages are printed.
        progress_callback: Optional callable receiving the running page total
            after each successfully inspected file.

    Returns:
        Tuple (page_info, total) where page_info is a list of
        (path, page_count) for the readable files.
    """
    suppress_pdfminer_logging()
    counted = []
    running_total = 0

    def record(pdf_path, page_count, error):
        nonlocal running_total
        if not error:
            counted.append((pdf_path, page_count))
            running_total += page_count
            if progress_callback:
                progress_callback(running_total)  # feedback for the GUI
            return
        if error_callback:
            error_callback(error)
        else:
            print(error, end="")

    if len(pdf_files) > 14:
        with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor:
            for outcome in executor.map(_process_single_pdf, pdf_files):
                record(*outcome)
    else:
        for pdf_path in pdf_files:
            record(*_process_single_pdf(pdf_path))
    return counted, running_total
# -------------------- GUI --------------------
class FileManager(wx.Frame):
    """Main window: manages the PDF list, starts/stops conversion, shows results.

    Attributes:
        files: List of PDF paths, kept in the same order as the list box.
        stop_flag: threading.Event set by "Stop Parser" and polled by the
            worker thread between files.
    """

    def __init__(self, parent):
        super().__init__(parent, title="PDF Parser - Sevenof9_v7d", size=(1000, 800))
        self.files = []
        # Created before the widgets so no handler can ever find it missing.
        self.stop_flag = threading.Event()
        self.InitUI()

    def InitUI(self):
        """Build all widgets and bind their event handlers."""
        panel = wx.Panel(self)
        vbox = wx.BoxSizer(wx.VERTICAL)

        hbox_lbl1 = wx.BoxSizer(wx.HORIZONTAL)
        lbl1 = wx.StaticText(panel, label="Filed PDF files: (with right mouse you can remove and open)")
        hbox_lbl1.Add(lbl1, flag=wx.ALIGN_CENTER_VERTICAL | wx.LEFT, border=10)
        hbox_lbl1.AddStretchSpacer()  # pushes the help button to the far right
        help_btn = wx.Button(panel, label="? HELP ?", size=(60, 25))
        help_btn.Bind(wx.EVT_BUTTON, self.ShowHelpText)
        hbox_lbl1.Add(help_btn, flag=wx.RIGHT, border=10)
        vbox.Add(hbox_lbl1, flag=wx.EXPAND | wx.TOP, border=10)

        self.listbox = wx.ListBox(panel, style=wx.LB_EXTENDED)
        self.listbox.Bind(wx.EVT_RIGHT_DOWN, self.OnRightClick)
        self.listbox.Bind(wx.EVT_LISTBOX, self.ShowText)
        vbox.Add(self.listbox, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        self.popup_menu = wx.Menu()
        self.popup_menu.Append(1, "Remove selected")
        self.popup_menu.Append(2, "Open in default PDF app")
        self.popup_menu.Append(3, "Copy File Location")
        self.popup_menu.Append(4, "Open File Location")
        self.Bind(wx.EVT_MENU, self.RemoveFile, id=1)
        self.Bind(wx.EVT_MENU, self.OpenPDF, id=2)
        self.Bind(wx.EVT_MENU, self.CopyFileLocation, id=3)
        self.Bind(wx.EVT_MENU, self.OpenFileLocation, id=4)

        btn_panel = wx.Panel(panel)
        btn_sizer = wx.BoxSizer(wx.HORIZONTAL)
        for label, handler in [
            ("Add Folder", self.AddFolder),
            ("Select Files", self.AddFile),
            ("Remove Selected", self.RemoveFile),
            ("Remove All", self.RemoveAll),
            ("Stop Parser", self.StopParser),
            ("Start Parser", self.StartParser),
        ]:
            btn = wx.Button(btn_panel, label=label)
            btn.Bind(wx.EVT_BUTTON, handler)
            if label == "Start Parser":
                self.start_btn = btn  # kept so it can be disabled while running
            btn_sizer.Add(btn, proportion=1, flag=wx.ALL, border=5)
        btn_panel.SetSizer(btn_sizer)
        vbox.Add(btn_panel, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        lbl2 = wx.StaticText(panel, label="Text Frame: (choose PDF to see converted text)")
        vbox.Add(lbl2, flag=wx.LEFT, border=10)
        self.text_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        self.ShowHelpText(None)
        vbox.Add(self.text_ctrl, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        # Status display
        stat_grid = wx.FlexGridSizer(1, 5, 5, 55)
        self.lbl_processed_pages = wx.StaticText(panel, label="Processed pages: 0")
        self.lbl_total_pages = wx.StaticText(panel, label="Total pages: 0")
        self.lbl_pages_per_sec = wx.StaticText(panel, label="Pages/sec: 0")
        self.lbl_est_time = wx.StaticText(panel, label="Estimated time (min): 0.0")
        self.lbl_elapsed_time = wx.StaticText(panel, label="Elapsed time: 0.0")
        for lbl in [self.lbl_processed_pages, self.lbl_total_pages, self.lbl_pages_per_sec,
                    self.lbl_est_time, self.lbl_elapsed_time]:
            stat_grid.Add(lbl)
        vbox.Add(stat_grid, flag=wx.LEFT | wx.TOP, border=10)

        self.prog_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        vbox.Add(self.prog_ctrl, proportion=1, flag=wx.EXPAND | wx.ALL, border=10)
        panel.SetSizer(vbox)

    def ShowHelpText(self, event):
        """Show the built-in help text in the text frame."""
        help_text = (
            " This is a small help\n\n"
            " • PRE ALPHA version (for ever) •\n"
            "• The generated TXT file has the same name as the PDF file\n"
            "• The TXT file is created in the same directory as the PDF\n"
            "• Older TXT files will be overwritten without prompting\n"
            "• When selecting a folder, subfolders are also selected\n"
            "If:\n"
            "[INFO] File completed: TEST.pdf (X pages)!\n"
            "[INFO] Processing completed\n"
            "-> This only means that all pages have been processed; it does not mean that the quality is good.\n"
            "• An attempt is made to reproduce the layout of the page in columns from left to right and in blocks from top to bottom\n"
            "• An attempt is made to detect regular tables with lines; headers (top or top and left) are assigned to the cells and stored in JSON format in the text file\n"
            "\n"
            "Stop function becomes effective only after the currently processed file\n"
            "When processing large amounts of data, the following should be noted:\n"
            "First, all PDFs are opened once to determine the number of pages:\n"
            "Then, all small PDFs are processed in parallel:\n"
            "Then, each large PDF is processed page by page in parallel:\n"
        )
        self.text_ctrl.SetValue(help_text)

    def _add_path(self, path):
        """Append *path* to the file list and list box unless already present."""
        if path not in self.files:
            self.files.append(path)
            self.listbox.Append(path)

    def AddFolder(self, event):
        """Add every ".pdf" file below a chosen folder, including subfolders."""
        dlg = wx.DirDialog(self, "Select Folder")
        try:
            if dlg.ShowModal() == wx.ID_OK:
                for root, _, files in os.walk(dlg.GetPath()):
                    for f in files:
                        if f.lower().endswith(".pdf"):
                            self._add_path(os.path.normpath(os.path.join(root, f)))
        finally:
            dlg.Destroy()

    def AddFile(self, event):
        """Add the PDF files picked in a multi-select file dialog."""
        with wx.FileDialog(self, "Select PDF Files", wildcard="PDF files (*.pdf)|*.pdf",
                           style=wx.FD_OPEN | wx.FD_MULTIPLE) as dlg:
            if dlg.ShowModal() == wx.ID_OK:
                for path in dlg.GetPaths():
                    # Normalised like AddFolder so the duplicate check agrees.
                    self._add_path(os.path.normpath(path))

    def RemoveFile(self, event):
        """Remove the selected entries from the list box and the file list."""
        # Highest index first so the remaining indices stay valid.
        for i in reversed(self.listbox.GetSelections()):
            self.listbox.Delete(i)
            del self.files[i]
        self.text_ctrl.Clear()

    def RemoveAll(self, event):
        """Remove every entry and clear the text frame."""
        self.listbox.Clear()
        self.files.clear()
        self.text_ctrl.Clear()

    def OpenPDF(self, event):
        """Open the first selected PDF with the platform's default handler."""
        i = self.listbox.GetSelections()
        if i:
            path = self.files[i[0]]
            if platform.system() == "Windows":
                os.startfile(path)
            elif platform.system() == "Darwin":
                subprocess.call(["open", path])
            else:
                subprocess.call(["xdg-open", path])

    def CopyFileLocation(self, event):
        """Copy the path of the first selected PDF to the clipboard."""
        sel = self.listbox.GetSelections()
        if sel:
            path = self.files[sel[0]]
            if wx.TheClipboard.Open():
                wx.TheClipboard.SetData(wx.TextDataObject(path))
                wx.TheClipboard.Close()

    def OpenFileLocation(self, event):
        """Open the folder of the first selected PDF in the file manager."""
        sel = self.listbox.GetSelections()
        if sel:
            folder = os.path.dirname(self.files[sel[0]])
            if platform.system() == "Windows":
                # Argument list instead of a hand-quoted command string, so
                # quoting is handled by subprocess.
                subprocess.Popen(["explorer", folder])
            elif platform.system() == "Darwin":
                subprocess.call(["open", folder])
            else:
                subprocess.call(["xdg-open", folder])

    def OnRightClick(self, event):
        """Show the context menu when at least one entry is selected."""
        if self.listbox.GetSelections():
            # No explicit position: the event position is relative to the
            # list box, not to this frame, so let wx use the mouse pointer.
            self.PopupMenu(self.popup_menu)

    def _apply_status(self, status):
        """Write a status dict into the five status labels (GUI thread only)."""
        self.lbl_processed_pages.SetLabel(f"Processed pages: {status['processed_pages']}")
        self.lbl_total_pages.SetLabel(f"Total pages: {status['total_pages']}")
        self.lbl_pages_per_sec.SetLabel(f"Pages/sec: {status['pages_per_sec']}")
        self.lbl_est_time.SetLabel(f"Estimated time (min): {status['est_time']}")
        self.lbl_elapsed_time.SetLabel(f"Elapsed time: {status['elapsed_time']}")

    def _run_small_files(self, small, tracker, status_callback):
        """Convert each small PDF as a whole in its own worker process."""
        if not small:
            return
        max_workers = max(1, min(len(small), get_physical_cores()))
        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = {}
            for path, count in small:
                if self.stop_flag.is_set():
                    break
                future = executor.submit(save_pdf, path, count, None, False, None)
                futures[future] = (path, count)
            for future in concurrent.futures.as_completed(futures):
                if self.stop_flag.is_set():
                    # Drop queued jobs; files already running still finish.
                    for pending in futures:
                        pending.cancel()
                    break
                path, count = futures[future]
                try:
                    pages_processed = future.result()
                    tracker.update(pages_processed)
                    status_callback(tracker.get_status())
                    wx.CallAfter(self.AppendProg, f"[INFO] File ready: {path} ({pages_processed} Seiten)\n")
                except Exception as e:
                    wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")

    def _run_large_files(self, large, tracker, status_callback):
        """Convert each large PDF in turn, page by page in parallel."""
        for path, count in large:
            if self.stop_flag.is_set():
                break
            try:
                pages_processed = save_pdf(
                    path,
                    count,
                    tracker,
                    parallel=True,
                    progress_callback=status_callback,
                    stop_flag=self.stop_flag,
                )
                if pages_processed:
                    wx.CallAfter(self.AppendProg, f"[INFO] File ready: {path} ({pages_processed} Seiten)\n")
                else:
                    wx.CallAfter(self.AppendProg, f"[INFO] Stopped: {path}\n")
            except Exception as e:
                wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")

    def StartParser(self, event):
        """Start converting all listed PDFs on a background thread.

        Page counting and conversion both run off the GUI thread; every
        widget update is marshalled back with wx.CallAfter.
        """
        if not self.files:
            wx.MessageBox("Please select files first.", "Hinweis", wx.OK | wx.ICON_INFORMATION)
            return
        self.start_btn.Disable()
        self.stop_flag.clear()
        self.prog_ctrl.Clear()
        # Snapshot: the list may be edited while the worker thread is running.
        pdf_files = list(self.files)

        def error_callback(msg):
            wx.CallAfter(self.AppendProg, msg)

        def update_total_pages_live(new_total):
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {new_total}")

        def gui_progress_callback(status):
            wx.CallAfter(self._apply_status, status)

        throttled_gui_callback = throttle_callback(gui_progress_callback, 100)

        def background():
            try:
                # Counting opens every PDF once; done here so the window
                # stays responsive and the total updates live.
                page_info, total_pages = get_total_pages(
                    pdf_files,
                    error_callback=error_callback,
                    progress_callback=update_total_pages_live,
                )
                if total_pages == 0:
                    wx.CallAfter(self.AppendProg, "[INFO] No pages found.\n")
                    return
                tracker = StatusTracker(total_pages)
                small = [p for p in page_info if p[1] <= PARALLEL_THRESHOLD]
                large = [p for p in page_info if p[1] > PARALLEL_THRESHOLD]
                self._run_small_files(small, tracker, throttled_gui_callback)
                self._run_large_files(large, tracker, throttled_gui_callback)
                # The throttle may have dropped the last update; push the
                # final numbers unthrottled.
                gui_progress_callback(tracker.get_status())
                wx.CallAfter(self.AppendProg, "\n[INFO] Processing completed.\n")
            except Exception as e:
                wx.CallAfter(self.AppendProg, f"[ERROR] {type(e).__name__}: {e}\n")
            finally:
                # Always hand control back, even after an unexpected error.
                wx.CallAfter(self.start_btn.Enable)
                self.stop_flag.clear()

        threading.Thread(target=background, daemon=True).start()

    def StopParser(self, event):
        """Request a stop; it takes effect between files."""
        self.stop_flag.set()
        self.AppendProg("[INFO] Processing Stopped...\n")

    def ShowText(self, event):
        """Show the ".txt" file belonging to the first selected PDF."""
        sel = self.listbox.GetSelections()
        if not sel:
            return
        txt_path = os.path.splitext(self.files[sel[0]])[0] + ".txt"
        self.text_ctrl.Clear()
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                self.text_ctrl.SetValue(f.read())
        else:
            self.text_ctrl.SetValue("[No .txt file found]")

    def AppendProg(self, text):
        """Append *text* to the progress log."""
        self.prog_ctrl.AppendText(text)
# -------------------- Entry point --------------------
def main():
    """Run as a command-line converter when paths are given, else open the GUI.

    With arguments, every path is treated as a PDF: pages are counted first,
    then each file is converted, using page-parallel processing for files
    with more than PARALLEL_THRESHOLD pages. Status dicts are printed as
    JSON lines.
    """
    cli_files = sys.argv[1:]
    if not cli_files:
        app = wx.App(False)
        window = FileManager(None)
        window.Show()
        app.MainLoop()
        return

    page_info, total_pages = get_total_pages(cli_files)
    tracker = StatusTracker(total_pages)

    def cli_callback(status):
        print(json.dumps(status))

    for pdf_path, page_count in page_info:
        use_parallel = page_count > PARALLEL_THRESHOLD
        save_pdf(pdf_path, page_count, tracker, parallel=use_parallel, progress_callback=cli_callback)
# Script entry: freeze_support() is called before main() so the process pools
# also work when the program runs as a frozen executable.
if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()