hopefully more stable and more comfortable
Browse files- PDF Parser - Sevenof9_v7d.py +826 -0
PDF Parser - Sevenof9_v7d.py
ADDED
@@ -0,0 +1,826 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import time
|
4 |
+
import json
|
5 |
+
import wx
|
6 |
+
import re
|
7 |
+
import platform
|
8 |
+
import subprocess
|
9 |
+
import threading
|
10 |
+
import concurrent.futures
|
11 |
+
import multiprocessing
|
12 |
+
from concurrent.futures import ProcessPoolExecutor
|
13 |
+
import pdfplumber
|
14 |
+
import psutil
|
15 |
+
import logging
|
16 |
+
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
|
17 |
+
from pdfminer.pdfdocument import PDFDocument, PDFEncryptionError, PDFPasswordIncorrect
|
18 |
+
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
19 |
+
from pdfminer.pdfinterp import PDFResourceManager
|
20 |
+
|
21 |
+
|
22 |
+
# -------------------- Configuration --------------------
|
23 |
+
# Files with more pages than this are processed page-by-page in parallel;
# smaller files are processed serially.
PARALLEL_THRESHOLD = 14

# Keyword arguments passed to pdfplumber's ``extract_words``.
TEXT_EXTRACTION_SETTINGS = dict(
    x_tolerance=1.5,
    y_tolerance=2.5,
    keep_blank_chars=False,
    use_text_flow=False,
)
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
# GUI update interval: limits how often a progress callback may fire
|
35 |
+
def throttle_callback(callback, interval_ms=1):
    """Wrap *callback* so it fires at most once every *interval_ms* ms.

    Calls that arrive before the interval has elapsed are dropped, not queued.
    The very first call is always forwarded.

    Args:
        callback: Callable taking a single ``status`` argument.
        interval_ms: Minimum time between two forwarded calls, in milliseconds.

    Returns:
        A one-argument function that forwards to *callback* when allowed.
    """
    last_called = None  # monotonic timestamp (ms) of the last forwarded call

    def wrapper(status):
        nonlocal last_called
        # time.monotonic() never goes backwards, so a wall-clock adjustment
        # (NTP sync, DST, manual change) cannot suppress the updates.
        now = time.monotonic() * 1000
        if last_called is None or now - last_called >= interval_ms:
            last_called = now
            callback(status)

    return wrapper
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
# Function to suppress PDFMiner logging, reducing verbosity
|
49 |
+
def suppress_pdfminer_logging():
    """Set the pdfminer loggers to ERROR so lower-level messages are dropped."""
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        pdfminer_logger = logging.getLogger(name)
        pdfminer_logger.setLevel(logging.ERROR)
|
61 |
+
|
62 |
+
|
63 |
+
# Matches every character outside the Basic Multilingual Plane (> U+FFFF).
EUROPEAN_PRINTABLES_PATTERN = re.compile(r"[^\u0000-\uFFFF]", re.DOTALL)
# Matches "(cid:NNN)" placeholders.
CID_PATTERN = re.compile(r"\(cid:\d+\)")


def clean_cell_text(text):
    """Normalise the text of one table cell.

    Non-string input yields an empty string. For strings, hyphenated line
    breaks ("-\\n") are joined, remaining newlines become spaces, and
    "(cid:N)" placeholders and characters above U+FFFF are removed.
    """
    if isinstance(text, str):
        dehyphenated = text.replace("-\n", "")
        single_line = dehyphenated.replace("\n", " ")
        without_cids = CID_PATTERN.sub("", single_line)
        return EUROPEAN_PRINTABLES_PATTERN.sub("", without_cids)
    return ""
|
72 |
+
|
73 |
+
def clamp_bbox(bbox, page_width, page_height, p=3):
    """Clamp a bounding box to the page and round its coordinates.

    Args:
        bbox: ``(x0, top, x1, bottom)`` tuple.
        page_width: Upper bound for the x coordinates (lower bound is 0).
        page_height: Upper bound for the y coordinates (lower bound is 0).
        p: Number of decimal places kept by ``round``.

    Returns:
        The clamped, rounded ``(x0, top, x1, bottom)`` tuple.
    """
    left, upper, right, lower = bbox

    def _clip(value, limit):
        return max(0, min(value, limit))

    return (
        round(_clip(left, page_width), p),
        round(_clip(upper, page_height), p),
        round(_clip(right, page_width), p),
        round(_clip(lower, page_height), p),
    )
|
80 |
+
|
81 |
+
def get_physical_cores():
    """Return the number of physical CPU cores reported by psutil (min. 1).

    Falls back to 1 when psutil reports no value.
    """
    detected = psutil.cpu_count(logical=False)
    if not detected:
        return 1
    return max(1, detected)


# Evaluated once at import time; used as the worker count for page counting.
cores = get_physical_cores()
|
85 |
+
|
86 |
+
|
87 |
+
def is_valid_cell(cell):
    """Return True if the cell holds more than one character after stripping.

    ``None`` counts as invalid; any other value is converted with ``str``
    and stripped of surrounding whitespace before its length is checked.
    """
    return cell is not None and len(str(cell).strip()) > 1
|
93 |
+
|
94 |
+
|
95 |
+
def block_area(block):
    """Return the area of the bounding box enclosing all words in *block*.

    Each word is a mapping with ``x0``, ``x1``, ``top`` and ``bottom`` keys.
    """
    left = min([word["x0"] for word in block])
    right = max([word["x1"] for word in block])
    upper = min([word["top"] for word in block])
    lower = max([word["bottom"] for word in block])
    width = right - left
    height = lower - upper
    return width * height
|
101 |
+
|
102 |
+
|
103 |
+
suppress_pdfminer_logging()
|
104 |
+
|
105 |
+
# -------------------- Status-Tracking --------------------
|
106 |
+
class StatusTracker:
    """Track page-processing progress and derive throughput and ETA figures."""

    def __init__(self, total_pages):
        """Start the clock for a job of *total_pages* pages."""
        self.start_time = time.time()
        self.total_pages = total_pages
        self.processed_pages = 0

    def update(self, n=1):
        """Record that *n* more pages have been processed."""
        self.processed_pages += n

    def get_status(self):
        """Return a snapshot of the current progress.

        Returns:
            dict with ``processed_pages``, ``total_pages``, ``pages_per_sec``
            (rounded to an integer), ``elapsed_time`` and ``est_time`` (both
            in minutes, one decimal). ``est_time`` is ``inf`` while no page
            has been processed.
        """
        elapsed = time.time() - self.start_time
        # Keep the unrounded rate for the ETA: rounding first would turn any
        # rate below 0.5 pages/s into 0 and make the estimate infinite.
        rate = self.processed_pages / elapsed if elapsed > 0 else 0.0
        remaining_pages = self.total_pages - self.processed_pages
        est_time = (remaining_pages / rate) / 60 if rate > 0 else float("inf")
        return {
            "processed_pages": self.processed_pages,
            "total_pages": self.total_pages,
            "pages_per_sec": round(rate),
            "elapsed_time": round(elapsed / 60, 1),
            "est_time": round(est_time, 1),
        }
|
127 |
+
|
128 |
+
|
129 |
+
# -------------------- PDF processing --------------------
|
130 |
+
# Largest horizontal / vertical gap (in page coordinate units) between two
# words that are still grouped into the same text block.
_MAX_DIST_X = 9
_MAX_DIST_Y = 10
# Grid size used to quantise block positions before sorting.
_SORT_TOLERANCE = 1


def _is_bold(fontname):
    """Guess from its name whether a font is a bold face."""
    fontname = fontname.lower()
    return "bold" in fontname or "bd" in fontname or "black" in fontname


def _table_to_data(cleaned_table):
    """Turn a cleaned table with a non-empty header row into keyed records.

    If the top-left cell is blank, the first column is treated as row headers
    and a ``{row_header: {col_header: cell}}`` dict is returned. Otherwise a
    list of ``{header: cell}`` dicts is returned, skipping rows whose length
    differs from the header row.
    """
    header_row = cleaned_table[0]
    body_rows = cleaned_table[1:]
    if header_row[0].strip() == "":
        col_headers = header_row[1:]
        # Empty rows carry no row header; skip them instead of raising.
        return {row[0]: dict(zip(col_headers, row[1:])) for row in body_rows if row}
    return [
        dict(zip(header_row, row))
        for row in body_rows
        if len(row) == len(header_row)
    ]


def _tables_to_json(extracted_tables):
    """Filter the extracted tables and serialise the usable ones to JSON."""
    tables_json = []
    for raw_table in extracted_tables:
        if not raw_table or len(raw_table) < 2:
            continue  # fewer than two rows
        if all(len(row) < 2 for row in raw_table if row):
            continue  # no row has at least two columns
        # Skip tables whose cells are all empty or hold a single character.
        if all(all(not is_valid_cell(cell) for cell in row) for row in raw_table):
            continue

        cleaned_table = [[clean_cell_text(c) for c in row] for row in raw_table]
        if not cleaned_table[0]:
            # An empty header row would raise IndexError and discard the
            # whole page via the caller's blanket ``except``.
            continue
        tables_json.append(
            json.dumps(_table_to_data(cleaned_table), indent=1, ensure_ascii=False)
        )
    return tables_json


def _collect_words(cropped_page, table_bboxes):
    """Extract words, dropping those inside a table or with non-BMP chars."""
    words = []
    for w in cropped_page.extract_words(**TEXT_EXTRACTION_SETTINGS):
        x0, top = float(w["x0"]), float(w["top"])
        if any(bx0 <= x0 <= bx1 and by0 <= top <= by1
               for bx0, by0, bx1, by1 in table_bboxes):
            continue
        if EUROPEAN_PRINTABLES_PATTERN.search(w["text"]):
            continue
        words.append(w)
    return words


def _describe_words(words, page_chars):
    """Attach font size, font name and a bold flag to every word."""
    word_info = []
    for w in words:
        x0 = float(w["x0"])
        x1 = float(w["x1"])
        top = float(w["top"])
        bottom = float(w["bottom"])

        # Characters whose top-left corner lies inside the word's box.
        chars = [
            c for c in page_chars
            if x0 <= float(c["x0"]) <= x1 and top <= float(c["top"]) <= bottom
        ]
        sizes = [float(c.get("size", 0)) for c in chars if c.get("text", "").strip()]

        word_info.append({
            "text": w["text"],
            "top": round(top, 1),
            "bottom": round(bottom, 1),
            "font_size": max(sizes) if sizes else 0,
            "font_name": chars[0].get("fontname", "") if chars else "Unknown",
            "bold_flag": any(_is_bold(c.get("fontname", "")) for c in chars),
            "x0": round(x0, 1),
            "x1": round(x1, 1),
        })
    return word_info


def _words_are_close(w1, w2):
    """Return True if the gap between two word boxes is within the limits."""
    dx = max(0, max(w1["x0"], w2["x0"]) - min(w1["x1"], w2["x1"]))
    dy = max(0, max(w1["top"], w2["top"]) - min(w1["bottom"], w2["bottom"]))
    return dx <= _MAX_DIST_X and dy <= _MAX_DIST_Y


def _group_into_blocks(words):
    """Group words into blocks of transitively close neighbours."""
    blocks = []
    unvisited = set(range(len(words)))
    while unvisited:
        seed = unvisited.pop()
        block = {seed}
        to_visit = {seed}
        while to_visit:
            current = to_visit.pop()
            for other in list(unvisited):
                if _words_are_close(words[current], words[other]):
                    block.add(other)
                    to_visit.add(other)
                    unvisited.remove(other)
        blocks.append([words[i] for i in block])
    return blocks


def _group_block_into_lines(block, line_tolerance=2.5):
    """Split a non-empty block into lines by ``top``; sort each line by ``x0``."""
    sorted_words = sorted(block, key=lambda w: w["top"])
    lines = []
    current_line = [sorted_words[0]]
    current_top = sorted_words[0]["top"]

    for word in sorted_words[1:]:
        if abs(word["top"] - current_top) <= line_tolerance:
            current_line.append(word)
        else:
            lines.append(sorted(current_line, key=lambda w: w["x0"]))
            current_line = [word]
            current_top = word["top"]
    if current_line:
        lines.append(sorted(current_line, key=lambda w: w["x0"]))
    return lines


def _block_sort_key(block):
    """Sort key: quantised leftmost ``x0`` first, then quantised topmost ``top``."""
    min_x0 = min(w["x0"] for w in block)
    min_top = min(w["top"] for w in block)
    return (
        round(min_x0 / _SORT_TOLERANCE) * _SORT_TOLERANCE,
        round(min_top / _SORT_TOLERANCE) * _SORT_TOLERANCE,
    )


def _label_block(block, avg_fontsize):
    """Return "CHAPTER", "IMPORTANT" or None for a block of words.

    Rule 1: only words longer than 5 characters that are not pure digits count.
    Rule 2: a word at least 1.15x the average font size is a chapter hit.
    Rule 3: otherwise a bold word of at least average size is an important hit.
    Rule 4: more than one hit -> IMPORTANT; exactly one hit -> its own kind.
    """
    chapter_hits = 0
    important_hits = 0
    for w in block:
        text = w["text"]
        if len(text) <= 5 or text.isdigit():
            continue
        size_ratio = w["font_size"] / avg_fontsize if avg_fontsize else 0
        if size_ratio >= 1.15:
            chapter_hits += 1
        elif w["bold_flag"] and size_ratio >= 1:
            important_hits += 1

    total_hits = chapter_hits + important_hits
    if total_hits > 1:
        return "IMPORTANT"
    if total_hits == 1:
        return "CHAPTER" if chapter_hits == 1 else "IMPORTANT"
    return None


def process_page_worker(args):
    """Extract the text and tables of one PDF page.

    Args:
        args: ``(page_number, path)`` tuple; ``page_number`` is zero-based.

    Returns:
        ``(page_number, text)``. On any exception the text is an
        ``[ERROR] ...`` line instead of the page content.
    """
    suppress_pdfminer_logging()
    try:
        page_number, path = args
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height
            # Ignore a 4 % margin on every side of the page.
            margin_x, margin_y = width * 0.04, height * 0.04
            cropped_page = page.crop(
                (margin_x, margin_y, width - margin_x, height - margin_y)
            )

            table_bboxes = [
                clamp_bbox(t.bbox, width, height) for t in cropped_page.find_tables()
            ]
            tables_json = _tables_to_json(
                cropped_page.extract_tables({"text_x_tolerance": 1.5})
            )

            words = _collect_words(cropped_page, table_bboxes)
            word_info = _describe_words(words, page.chars)
            avg_fontsize = (
                sum(w["font_size"] for w in word_info) / len(word_info)
                if word_info else 0
            )

            # Blocks are emitted sorted by x0 first, then by top.
            sorted_blocks = sorted(_group_into_blocks(word_info), key=_block_sort_key)

            output_lines = [
                f"\nPage {page_number + 1}, Seite {page_number + 1}, "
                f"Página {page_number + 1}\n"
            ]
            for block in sorted_blocks:
                block_label = _label_block(block, avg_fontsize)
                output_lines.append("")  # blank line before each block
                for line_idx, line in enumerate(_group_block_into_lines(block)):
                    line_text = " ".join(w["text"] for w in line)
                    if line_idx == 0 and block_label:
                        line_text = f"[{block_label}] {line_text}"
                    output_lines.append(line_text)

            # Tables are appended after the running text.
            for idx, tbl in enumerate(tables_json, 1):
                output_lines.append(f'"table {idx}":\n{tbl}')

            return page_number, "\n".join(output_lines)

    except Exception as e:
        msg = str(e).strip() or f"{type(e).__name__} (no message)"
        return args[0], f"[ERROR] Seite {args[0]+1}: {msg}"
|
369 |
+
|
370 |
+
|
371 |
+
|
372 |
+
def run_serial(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process pages ``0 .. page_number - 1`` of *path* one after another.

    Stops early when *stop_flag* is set. After each page the tracker (if any)
    is advanced, and progress is reported when both a tracker and a callback
    are given.

    Returns:
        List of ``process_page_worker`` results in page order.
    """
    collected = []
    should_report = progress_callback and tracker is not None
    for page_index in range(page_number):
        if stop_flag and stop_flag.is_set():
            break
        collected.append(process_page_worker((page_index, path)))
        if tracker is not None:
            tracker.update()
        if should_report:
            report_status(tracker, progress_callback)
    return collected
|
384 |
+
|
385 |
+
|
386 |
+
|
387 |
+
|
388 |
+
def run_parallel(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process all pages of *path* in a process pool.

    One task per page is submitted; the pool size is the smaller of the page
    count and the number of physical cores. *stop_flag* is accepted for
    signature parity with ``run_serial`` but is not consulted here.

    Returns:
        List of non-empty ``process_page_worker`` results in page order.
    """
    jobs = [(page_index, path) for page_index in range(page_number)]
    ordered = [None] * page_number

    with concurrent.futures.ProcessPoolExecutor(
        max_workers=min(page_number, get_physical_cores())
    ) as pool:
        pending = [pool.submit(process_page_worker, job) for job in jobs]
        for finished in concurrent.futures.as_completed(pending):
            outcome = finished.result()
            if outcome is None:
                continue
            page_index, _ = outcome
            ordered[page_index] = outcome
            if tracker is not None:
                tracker.update()
                if progress_callback:
                    report_status(tracker, progress_callback)

    return [entry for entry in ordered if entry]
|
411 |
+
|
412 |
+
|
413 |
+
def report_status(tracker, progress_callback=None):
    """Forward the tracker's current status to a callback or to stdout.

    Args:
        tracker: Object whose ``get_status()`` returns the status dict.
        progress_callback: Optional callable receiving that dict. Without it,
            a one-line summary is printed instead.
    """
    status = tracker.get_status()
    if progress_callback:
        progress_callback(status)
        return
    # elapsed_time / est_time are in minutes, so label them accordingly and
    # keep the parentheses balanced.
    print(
        f"[STATUS] {status['processed_pages']}/{status['total_pages']} Seiten "
        f"({status['pages_per_sec']} Seiten/s, "
        f"Elapsed: {status['elapsed_time']} Min., "
        f"Est Time: {status['est_time']} Min.)"
    )
|
422 |
+
|
423 |
+
|
424 |
+
def save_pdf(path, page_number, tracker=None, parallel=False, progress_callback=None, stop_flag=None):
    """Convert a PDF to text and write it next to the PDF as ``<name>.txt``.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (pages ``0 .. page_number-1``).
        tracker: Optional progress tracker passed to the page runner.
        parallel: Use ``run_parallel`` instead of ``run_serial``.
        progress_callback: Optional status callback passed to the runner.
        stop_flag: Optional event; if already set, nothing is done.

    Returns:
        Number of pages actually processed (0 if stopped before starting).
        An existing ``.txt`` file is overwritten.
    """
    if stop_flag and stop_flag.is_set():
        return 0

    runner = run_parallel if parallel else run_serial
    results = runner(path, page_number, tracker, progress_callback, stop_flag)

    results = [r for r in results if r]  # drop empty entries (e.g. after a stop)
    results.sort(key=lambda item: item[0])
    text_output = "\n".join(text for _, text in results)

    out_path = os.path.splitext(path)[0] + ".txt"
    with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(text_output)

    # Report what was really done: a serial run stopped early yields fewer
    # results than requested.
    return len(results)
|
443 |
+
|
444 |
+
|
445 |
+
|
446 |
+
def _process_single_pdf(path):
    """Open one PDF with pdfminer and count its pages.

    Returns:
        ``(path, page_count, None)`` on success, or ``(path, 0, message)``
        where *message* is an ``[ERROR] ...`` line describing the failure.
    """
    suppress_pdfminer_logging()
    try:
        with open(path, "rb") as f:
            document = PDFDocument(PDFParser(f))

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed("Text-Extraktion nicht erlaubt")

            page_count = sum(1 for _ in PDFPage.create_pages(document))
            return (path, page_count, None)

    # NOTE(review): in pdfminer.six PDFTextExtractionNotAllowed derives from
    # PDFEncryptionError, so it must be handled before the encryption handler
    # or it is reported as "password protected" - confirm for the installed
    # version.
    except PDFTextExtractionNotAllowed as e:
        return (path, 0, f"[ERROR] Text-Extraktion nicht erlaubt: {path} ({type(e).__name__}: {e})\n")
    except (PDFEncryptionError, PDFPasswordIncorrect) as e:
        return (path, 0, f"[ERROR] Datei passwortgeschützt: {path} ({type(e).__name__}: {e})\n")
    except PDFSyntaxError as e:
        return (path, 0, f"[ERROR] Ungültige PDF-Syntax: {path} ({type(e).__name__}: {e})\n")
    except Exception as e:
        return (path, 0, f"[ERROR] Fehler bei Datei {path}: {type(e).__name__}: {e}\n")
|
467 |
+
|
468 |
+
def get_total_pages(pdf_files, error_callback=None, progress_callback=None):
    """Count the pages of every PDF in *pdf_files*.

    More than 14 files are counted in a process pool, fewer in-process.

    Args:
        pdf_files: Sequence of PDF paths.
        error_callback: Receives the error message of each unreadable file;
            without it the message is printed.
        progress_callback: Receives the running page total after each
            successfully counted file.

    Returns:
        ``(page_info, total)`` where *page_info* is a list of
        ``(path, page_count)`` for the readable files.
    """
    suppress_pdfminer_logging()
    page_info = []
    running_total = 0

    def record(path, count, error):
        nonlocal running_total
        if not error:
            page_info.append((path, count))
            running_total += count
            if progress_callback:
                progress_callback(running_total)  # feedback for the GUI
            return
        if error_callback:
            error_callback(error)
        else:
            print(error, end="")

    if len(pdf_files) > 14:
        with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as pool:
            for outcome in pool.map(_process_single_pdf, pdf_files):
                record(*outcome)
    else:
        for pdf_path in pdf_files:
            record(*_process_single_pdf(pdf_path))

    return page_info, running_total
|
497 |
+
|
498 |
+
|
499 |
+
|
500 |
+
|
501 |
+
# -------------------- GUI --------------------
|
502 |
+
class FileManager(wx.Frame):
    """Main window: manage a list of PDFs, run the parser, show the results."""

    def __init__(self, parent):
        super().__init__(parent, title="PDF Parser - Sevenof9_v7d", size=(1000, 800))
        self.files = []
        # Created before the widgets so every handler can rely on it.
        self.stop_flag = threading.Event()
        self.InitUI()

    def InitUI(self):
        """Build the widgets: file list, buttons, text view, status, log."""
        panel = wx.Panel(self)
        vbox = wx.BoxSizer(wx.VERTICAL)

        hbox_lbl1 = wx.BoxSizer(wx.HORIZONTAL)
        lbl1 = wx.StaticText(panel, label="Filed PDF files: (with right mouse you can remove and open)")
        hbox_lbl1.Add(lbl1, flag=wx.ALIGN_CENTER_VERTICAL | wx.LEFT, border=10)
        hbox_lbl1.AddStretchSpacer()  # pushes the help button to the far right

        help_btn = wx.Button(panel, label="? HELP ?", size=(60, 25))
        help_btn.Bind(wx.EVT_BUTTON, self.ShowHelpText)
        hbox_lbl1.Add(help_btn, flag=wx.RIGHT, border=10)
        vbox.Add(hbox_lbl1, flag=wx.EXPAND | wx.TOP, border=10)

        self.listbox = wx.ListBox(panel, style=wx.LB_EXTENDED)
        self.listbox.Bind(wx.EVT_RIGHT_DOWN, self.OnRightClick)
        self.listbox.Bind(wx.EVT_LISTBOX, self.ShowText)
        vbox.Add(self.listbox, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        self.popup_menu = wx.Menu()
        self.popup_menu.Append(1, "Remove selected")
        self.popup_menu.Append(2, "Open in default PDF app")
        self.popup_menu.Append(3, "Copy File Location")
        self.popup_menu.Append(4, "Open File Location")
        self.Bind(wx.EVT_MENU, self.RemoveFile, id=1)
        self.Bind(wx.EVT_MENU, self.OpenPDF, id=2)
        self.Bind(wx.EVT_MENU, self.CopyFileLocation, id=3)
        self.Bind(wx.EVT_MENU, self.OpenFileLocation, id=4)

        btn_panel = wx.Panel(panel)
        btn_sizer = wx.BoxSizer(wx.HORIZONTAL)
        for label, handler in [
            ("Add Folder", self.AddFolder),
            ("Select Files", self.AddFile),
            ("Remove Selected", self.RemoveFile),
            ("Remove All", self.RemoveAll),
            ("Stop Parser", self.StopParser),
            ("Start Parser", self.StartParser),
        ]:
            btn = wx.Button(btn_panel, label=label)
            btn.Bind(wx.EVT_BUTTON, handler)
            if label == "Start Parser":
                self.start_btn = btn  # kept so it can be disabled while running
            btn_sizer.Add(btn, proportion=1, flag=wx.ALL, border=5)
        btn_panel.SetSizer(btn_sizer)
        vbox.Add(btn_panel, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        lbl2 = wx.StaticText(panel, label="Text Frame: (choose PDF to see converted text)")
        vbox.Add(lbl2, flag=wx.LEFT, border=10)

        self.text_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        self.ShowHelpText(None)
        vbox.Add(self.text_ctrl, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        # Status display
        stat_grid = wx.FlexGridSizer(1, 5, 5, 55)
        self.lbl_processed_pages = wx.StaticText(panel, label="Processed pages: 0")
        self.lbl_total_pages = wx.StaticText(panel, label="Total pages: 0")
        self.lbl_pages_per_sec = wx.StaticText(panel, label="Pages/sec: 0")
        self.lbl_est_time = wx.StaticText(panel, label="Estimated time (min): 0.0")
        self.lbl_elapsed_time = wx.StaticText(panel, label="Elapsed time: 0.0")

        for lbl in [self.lbl_processed_pages, self.lbl_total_pages, self.lbl_pages_per_sec,
                    self.lbl_est_time, self.lbl_elapsed_time]:
            stat_grid.Add(lbl)
        vbox.Add(stat_grid, flag=wx.LEFT | wx.TOP, border=10)

        self.prog_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        vbox.Add(self.prog_ctrl, proportion=1, flag=wx.EXPAND | wx.ALL, border=10)

        panel.SetSizer(vbox)

    def ShowHelpText(self, event):
        """Show the built-in help text in the text view."""
        help_text = (
            " This is a small help\n\n"
            " • PRE ALPHA version (for ever) •\n"
            "• The generated TXT file has the same name as the PDF file\n"
            "• The TXT file is created in the same directory as the PDF\n"
            "• Older TXT files will be overwritten without prompting\n"
            "• When selecting a folder, subfolders are also selected\n"
            "If:\n"
            "[INFO] File completed: TEST.pdf (X pages)!\n"
            "[INFO] Processing completed\n"
            "-> This only means that all pages have been processed; it does not mean that the quality is good.\n"
            "• An attempt is made to reproduce the layout of the page in columns from left to right and in blocks from top to bottom\n"
            "• An attempt is made to detect regular tables with lines; headers (top or top and left) are assigned to the cells and stored in JSON format in the text file\n"
            "\n"
            "Stop function becomes effective only after the currently processed file\n"
            "When processing large amounts of data, the following should be noted:\n"
            "First, all PDFs are opened once to determine the number of pages:\n"
            "Then, all small PDFs are processed in parallel:\n"
            "Then, each large PDF is processed page by page in parallel:\n"
        )
        self.text_ctrl.SetValue(help_text)

    def AddFolder(self, event):
        """Add every ``.pdf`` below a chosen folder (recursively) to the list."""
        # Context manager destroys the dialog even if walking the tree fails.
        with wx.DirDialog(self, "Select Folder") as dlg:
            if dlg.ShowModal() != wx.ID_OK:
                return
            known = set(self.files)  # O(1) duplicate check for large folders
            for root, _, files in os.walk(dlg.GetPath()):
                for f in files:
                    if not f.lower().endswith(".pdf"):
                        continue
                    path = os.path.normpath(os.path.join(root, f))
                    if path not in known:
                        known.add(path)
                        self.files.append(path)
                        self.listbox.Append(path)

    def AddFile(self, event):
        """Add individually selected PDF files to the list."""
        with wx.FileDialog(self, "Select PDF Files", wildcard="PDF files (*.pdf)|*.pdf",
                           style=wx.FD_OPEN | wx.FD_MULTIPLE) as dlg:
            if dlg.ShowModal() == wx.ID_OK:
                for path in dlg.GetPaths():
                    if path not in self.files:
                        self.files.append(path)
                        self.listbox.Append(path)

    def RemoveFile(self, event):
        """Remove the selected entries from the list."""
        # Delete from the end so the remaining indices stay valid.
        for i in reversed(self.listbox.GetSelections()):
            self.listbox.Delete(i)
            del self.files[i]
        self.text_ctrl.Clear()

    def RemoveAll(self, event):
        """Clear the file list and the text view."""
        self.listbox.Clear()
        self.files.clear()
        self.text_ctrl.Clear()

    def OpenPDF(self, event):
        """Open the first selected PDF with the platform's default application."""
        i = self.listbox.GetSelections()
        if i:
            path = self.files[i[0]]
            if platform.system() == "Windows":
                os.startfile(path)
            elif platform.system() == "Darwin":
                subprocess.call(["open", path])
            else:
                subprocess.call(["xdg-open", path])

    def CopyFileLocation(self, event):
        """Copy the path of the first selected file to the clipboard."""
        sel = self.listbox.GetSelections()
        if sel:
            path = self.files[sel[0]]
            if wx.TheClipboard.Open():
                wx.TheClipboard.SetData(wx.TextDataObject(path))
                wx.TheClipboard.Close()

    def OpenFileLocation(self, event):
        """Open the folder of the first selected file in the file manager."""
        sel = self.listbox.GetSelections()
        if sel:
            folder = os.path.dirname(self.files[sel[0]])
            if platform.system() == "Windows":
                # Argument list instead of a hand-quoted command string.
                subprocess.Popen(["explorer", folder])
            elif platform.system() == "Darwin":
                subprocess.call(["open", folder])
            else:
                subprocess.call(["xdg-open", folder])

    def OnRightClick(self, event):
        """Show the context menu when at least one entry is selected."""
        if self.listbox.GetSelections():
            # The event position is relative to the list box, not the frame;
            # the default position (current mouse pointer) is correct here.
            self.PopupMenu(self.popup_menu)

    def StartParser(self, event):
        """Count pages and convert all listed PDFs in a background thread."""
        if not self.files:
            wx.MessageBox("Please select files first.", "Hinweis", wx.OK | wx.ICON_INFORMATION)
            return

        self.start_btn.Disable()
        self.stop_flag.clear()
        self.prog_ctrl.Clear()

        # Snapshot: later edits of the list must not affect the running job.
        files = list(self.files)

        def error_callback(msg):
            wx.CallAfter(self.AppendProg, msg)

        def update_total_pages_live(new_total):
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {new_total}")

        def gui_progress_callback(status):
            wx.CallAfter(self.lbl_processed_pages.SetLabel, f"Processed pages: {status['processed_pages']}")
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {status['total_pages']}")
            wx.CallAfter(self.lbl_pages_per_sec.SetLabel, f"Pages/sec: {status['pages_per_sec']}")
            wx.CallAfter(self.lbl_est_time.SetLabel, f"Estimated time (min): {status['est_time']}")
            wx.CallAfter(self.lbl_elapsed_time.SetLabel, f"Elapsed time: {status['elapsed_time']}")

        throttled_gui_callback = throttle_callback(gui_progress_callback, 100)

        def process(page_info, tracker):
            small = [p for p in page_info if p[1] <= PARALLEL_THRESHOLD]
            large = [p for p in page_info if p[1] > PARALLEL_THRESHOLD]

            # Small files: one whole file per worker process.
            if small:
                max_workers = max(1, min(len(small), get_physical_cores()))
                with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
                    futures = {}
                    for path, count in small:
                        if self.stop_flag.is_set():
                            break
                        future = executor.submit(save_pdf, path, count, None, False, None)
                        futures[future] = (path, count)

                    for future in concurrent.futures.as_completed(futures):
                        if self.stop_flag.is_set():
                            # Drop files that have not started yet so the
                            # pool shuts down as soon as possible.
                            for pending in futures:
                                pending.cancel()
                            break
                        path, count = futures[future]
                        try:
                            pages_processed = future.result()
                            tracker.update(pages_processed)
                            throttled_gui_callback(tracker.get_status())
                            wx.CallAfter(self.AppendProg, f"[INFO] File ready: {path} ({pages_processed} Seiten)\n")
                        except Exception as e:
                            wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")

            # Large files: one file at a time, its pages in parallel.
            for path, count in large:
                if self.stop_flag.is_set():
                    break
                try:
                    pages_processed = save_pdf(
                        path,
                        count,
                        tracker,
                        parallel=True,
                        progress_callback=throttled_gui_callback,
                        stop_flag=self.stop_flag,
                    )
                    if pages_processed:
                        wx.CallAfter(
                            self.AppendProg,
                            f"[INFO] File ready: {path} ({pages_processed} Seiten)\n",
                        )
                    else:
                        wx.CallAfter(self.AppendProg, f"[INFO] Stopped: {path}\n")
                except Exception as e:
                    wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")

        def background():
            try:
                # Counting pages opens every PDF; run it here so the GUI
                # thread stays responsive.
                page_info, total_pages = get_total_pages(
                    files,
                    error_callback=error_callback,
                    progress_callback=update_total_pages_live,
                )
                if total_pages == 0:
                    wx.CallAfter(self.AppendProg, "[INFO] No pages found.\n")
                    return

                tracker = StatusTracker(total_pages)
                process(page_info, tracker)

                # The throttle may have dropped the last update; push the
                # final numbers unthrottled.
                gui_progress_callback(tracker.get_status())
                wx.CallAfter(self.AppendProg, "\n[INFO] Processing completed.\n")
            except Exception as e:
                wx.CallAfter(self.AppendProg, f"[ERROR] {type(e).__name__}: {e}\n")
            finally:
                # Always re-enable Start, even after an unexpected failure.
                wx.CallAfter(self.start_btn.Enable)
                self.stop_flag.clear()

        threading.Thread(target=background, daemon=True).start()

    def StopParser(self, event):
        """Ask the background job to stop."""
        self.stop_flag.set()
        self.AppendProg("[INFO] Processing Stopped...\n")

    def ShowText(self, event):
        """Show the ``.txt`` that belongs to the first selected PDF, if any."""
        sel = self.listbox.GetSelections()
        if not sel:
            return
        txt_path = os.path.splitext(self.files[sel[0]])[0] + ".txt"
        self.text_ctrl.Clear()
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                self.text_ctrl.SetValue(f.read())
        else:
            self.text_ctrl.SetValue("[No .txt file found]")

    def AppendProg(self, text):
        """Append *text* to the progress log."""
        self.prog_ctrl.AppendText(text)
|
803 |
+
|
804 |
+
|
805 |
+
# -------------------- Entry point --------------------
|
806 |
+
def main():
    """Run as CLI when PDF paths are given as arguments, otherwise start the GUI.

    In CLI mode each status update is printed as one JSON object per line.
    """
    cli_paths = sys.argv[1:]
    if not cli_paths:
        app = wx.App(False)
        window = FileManager(None)
        window.Show()
        app.MainLoop()
        return

    page_info, total_pages = get_total_pages(cli_paths)
    tracker = StatusTracker(total_pages)

    def cli_callback(status):
        print(json.dumps(status))

    for pdf_path, page_count in page_info:
        use_parallel = page_count > PARALLEL_THRESHOLD
        save_pdf(pdf_path, page_count, tracker, parallel=use_parallel, progress_callback=cli_callback)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()
|