kalle07 committed on
Commit
8958f01
·
verified ·
1 Parent(s): 4b74ae5

hopefully more stable and more comfortable

Browse files
Files changed (1) hide show
  1. PDF Parser - Sevenof9_v7d.py +826 -0
PDF Parser - Sevenof9_v7d.py ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ import json
5
+ import wx
6
+ import re
7
+ import platform
8
+ import subprocess
9
+ import threading
10
+ import concurrent.futures
11
+ import multiprocessing
12
+ from concurrent.futures import ProcessPoolExecutor
13
+ import pdfplumber
14
+ import psutil
15
+ import logging
16
+ from pdfminer.pdfparser import PDFParser, PDFSyntaxError
17
+ from pdfminer.pdfdocument import PDFDocument, PDFEncryptionError, PDFPasswordIncorrect
18
+ from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
19
+ from pdfminer.pdfinterp import PDFResourceManager
20
+
21
+
22
# -------------------- Configuration --------------------
# Page-count cutoff used by the callers in this file: PDFs with more pages
# than this are processed page-by-page in parallel, smaller ones as a whole.
PARALLEL_THRESHOLD = 14

# Keyword arguments forwarded to pdfplumber's ``extract_words()`` when the
# words of a page are collected.
TEXT_EXTRACTION_SETTINGS = {
    "x_tolerance": 1.5,
    "y_tolerance": 2.5,
    "keep_blank_chars": False,
    "use_text_flow": False,
}
31
+
32
+
33
+
34
+ # GUI update interval
35
def throttle_callback(callback, interval_ms=1):
    """Return a wrapper that forwards to *callback* at most once per interval.

    Args:
        callback: Callable taking a single ``status`` argument.
        interval_ms: Minimum time in milliseconds between two forwarded calls.

    Returns:
        A function ``wrapper(status)``. Calls arriving sooner than
        *interval_ms* after the last forwarded call are silently dropped.
    """
    # Timestamp (ms since the epoch) of the last forwarded call; 0 means the
    # very first call always goes through.
    state = {"last_ms": 0}

    def wrapper(status):
        current_ms = time.time() * 1000  # wall-clock time in milliseconds
        if current_ms - state["last_ms"] < interval_ms:
            return  # too soon after the previous call: drop this update
        state["last_ms"] = current_ms
        callback(status)

    return wrapper
45
+
46
+
47
+
48
+ # Function to suppress PDFMiner logging, reducing verbosity
49
def suppress_pdfminer_logging():
    """Set the pdfminer loggers listed below to ERROR to cut log noise."""
    quiet_level = logging.ERROR  # anything below ERROR is discarded
    for name in (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    ):
        logging.getLogger(name).setLevel(quiet_level)
61
+
62
+
63
# Matches any single character outside the range U+0000..U+FFFF.
EUROPEAN_PRINTABLES_PATTERN = re.compile(r"[^\u0000-\uFFFF]", re.DOTALL)
# Matches literal "(cid:<digits>)" tokens.
CID_PATTERN = re.compile(r"\(cid:\d+\)")


def clean_cell_text(text):
    """Normalise the text of one table cell.

    Non-string input (for example ``None``) yields ``""``. For strings,
    hyphenated line breaks are joined, remaining newlines become spaces,
    "(cid:N)" tokens are removed and characters above U+FFFF are stripped.
    """
    if isinstance(text, str):
        dehyphenated = text.replace("-\n", "")
        single_line = dehyphenated.replace("\n", " ")
        without_cid = CID_PATTERN.sub("", single_line)
        return EUROPEAN_PRINTABLES_PATTERN.sub("", without_cid)
    return ""
72
+
73
def clamp_bbox(bbox, page_width, page_height, p=3):
    """Clamp a bounding box to the page and round its coordinates.

    Args:
        bbox: Sequence ``(x0, top, x1, bottom)``.
        page_width: Upper limit for the x coordinates.
        page_height: Upper limit for the y coordinates.
        p: Number of decimal places to round to.

    Returns:
        Tuple ``(x0, top, x1, bottom)`` with every value limited to
        ``[0, page_width]`` / ``[0, page_height]`` and rounded to *p* places.
    """

    def _limit(value, upper):
        return max(0, min(value, upper))

    left, upper_edge, right, lower_edge = bbox
    return (
        round(_limit(left, page_width), p),
        round(_limit(upper_edge, page_height), p),
        round(_limit(right, page_width), p),
        round(_limit(lower_edge, page_height), p),
    )
80
+
81
def get_physical_cores():
    """Return the physical core count reported by psutil, never less than 1."""
    physical = psutil.cpu_count(logical=False)
    if not physical:
        return 1  # psutil gave None/0: fall back to a single core
    return max(1, physical)


# Evaluated once when the module is imported.
cores = get_physical_cores()
85
+
86
+
87
def is_valid_cell(cell):
    """Return True if *cell* holds more than whitespace or a single character."""
    return cell is not None and len(str(cell).strip()) >= 2
93
+
94
+
95
def block_area(block):
    """Return the area of the bounding rectangle enclosing all words in *block*.

    Each word is a mapping with ``"x0"``, ``"x1"``, ``"top"`` and ``"bottom"``
    keys.
    """
    left_edge = min(word["x0"] for word in block)
    right_edge = max(word["x1"] for word in block)
    upper_edge = min(word["top"] for word in block)
    lower_edge = max(word["bottom"] for word in block)
    width = right_edge - left_edge
    height = lower_edge - upper_edge
    return width * height
101
+
102
+
103
+ suppress_pdfminer_logging()
104
+
105
+ # -------------------- Status-Tracking --------------------
106
class StatusTracker:
    """Track how many pages were processed and derive throughput figures."""

    def __init__(self, total_pages):
        """Start the clock for a job of *total_pages* pages."""
        self.start_time = time.time()
        self.total_pages = total_pages
        self.processed_pages = 0

    def update(self, n=1):
        """Record that *n* more pages have been processed."""
        self.processed_pages += n

    def get_status(self):
        """Return a snapshot of the current progress.

        Returns:
            dict with ``processed_pages``, ``total_pages``, ``pages_per_sec``
            (rounded to a whole number), ``elapsed_time`` and ``est_time``
            (both in minutes, one decimal). ``est_time`` is ``inf`` while no
            page has been processed yet.
        """
        elapsed = time.time() - self.start_time
        # Keep the exact rate for the estimate. Using the rounded value would
        # turn any throughput below 0.5 pages/s into 0 and make the estimate
        # infinite even though work is progressing.
        rate = self.processed_pages / elapsed if elapsed > 0 else 0.0
        remaining_pages = self.total_pages - self.processed_pages
        est_time = (remaining_pages / rate) / 60 if rate > 0 else float("inf")
        return {
            "processed_pages": self.processed_pages,
            "total_pages": self.total_pages,
            "pages_per_sec": round(rate),
            "elapsed_time": round(elapsed / 60, 1),
            "est_time": round(est_time, 1),
        }
127
+
128
+
129
+ # -------------------- PDF Verarbeitung --------------------
130
def process_page_worker(args):
    """Extract text blocks and tables from a single PDF page.

    Args:
        args: Tuple ``(page_number, path)`` with the zero-based page index
            and the path of the PDF file.

    Returns:
        Tuple ``(page_number, text)``. ``text`` holds a page header, the
        grouped text blocks (optionally tagged ``[CHAPTER]`` / ``[IMPORTANT]``)
        and the detected tables as JSON. If anything raises, ``text`` is an
        ``[ERROR] Seite N: ...`` message instead.
    """
    suppress_pdfminer_logging()
    try:
        page_number, path = args
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height
            # Ignore a 4 % margin on every side of the page.
            margin_x, margin_y = width * 0.04, height * 0.04

            cropped_page = page.crop((margin_x, margin_y, width - margin_x, height - margin_y))
            # NOTE(review): this list covers every detected table, including
            # the ones skipped by the filters below, so words inside a skipped
            # table are excluded from the text output as well - confirm that
            # this is intended.
            table_bboxes = [clamp_bbox(t.bbox, width, height) for t in cropped_page.find_tables()]
            extracted_tables = cropped_page.extract_tables({"text_x_tolerance": 1.5})
            tables_json = []

            for raw_table in extracted_tables:
                if not raw_table or len(raw_table) < 2:
                    continue  # fewer than 2 rows

                # Require at least 2 columns
                if all(len(row) < 2 for row in raw_table if row):
                    continue

                # Skip empty or almost empty tables (only whitespace or 1 character per cell)
                if all(all(not is_valid_cell(cell) for cell in row) for row in raw_table):
                    continue

                cleaned_table = [[clean_cell_text(c) for c in row] for row in raw_table]
                header_row = cleaned_table[0]
                # An empty top-left cell is treated as "table has both column
                # and row headers".
                is_corner_empty = header_row[0].strip() == ""

                if is_corner_empty:
                    col_headers = cleaned_table[0][1:]
                    row_headers = [row[0] for row in cleaned_table[1:]]
                    data_rows = cleaned_table[1:]

                    # Nested mapping: row header -> {column header -> cell}
                    table_data = {}
                    for row_header, row in zip(row_headers, data_rows):
                        row_dict = {}
                        for col_header, cell in zip(col_headers, row[1:]):
                            row_dict[col_header] = cell
                        table_data[row_header] = row_dict
                else:
                    headers = header_row
                    data_rows = cleaned_table[1:]
                    # List of {header -> cell}; rows whose length differs from
                    # the header row are dropped.
                    table_data = []
                    for row in data_rows:
                        if len(row) == len(headers):
                            table_data.append(dict(zip(headers, row)))

                tables_json.append(json.dumps(table_data, indent=1, ensure_ascii=False))


            words = []
            for w in cropped_page.extract_words(**TEXT_EXTRACTION_SETTINGS):
                x0, top = float(w["x0"]), float(w["top"])
                # Skip words whose top-left corner lies inside a table box.
                if any(bx0 <= x0 <= bx2 and by0 <= top <= by3 for bx0, by0, bx2, by3 in table_bboxes):
                    continue
                # Skip words containing characters above U+FFFF.
                if EUROPEAN_PRINTABLES_PATTERN.search(w["text"]):
                    continue
                words.append(w)

            def is_bold(fontname: str) -> bool:
                """Guess boldness from the font name ("bold", "bd" or "black")."""
                fontname = fontname.lower()
                return "bold" in fontname or "bd" in fontname or "black" in fontname

            word_info = []
            font_sizes = []
            for w in words:
                x0 = float(w["x0"])
                x1 = float(w["x1"])
                top = float(w["top"])
                bottom = float(w["bottom"])
                text = w["text"]

                # Characters whose top-left corner falls inside the word box.
                # Note: this scans the chars of the uncropped page once per word.
                chars = [c for c in page.chars if x0 <= float(c["x0"]) <= x1 and top <= float(c["top"]) <= bottom]
                sizes = [float(c.get("size", 0)) for c in chars if c.get("text", "").strip()]
                fonts = [c.get("fontname", "") for c in chars]
                bold_flags = [is_bold(c.get("fontname", "")) for c in chars]

                font_size = max(sizes) if sizes else 0
                font_sizes.append(font_size)
                font_name = fonts[0] if fonts else "Unknown"
                bold_flag = any(bold_flags)

                word_info.append({
                    "text": text,
                    "top": round(top, 1),
                    "bottom": round(bottom, 1),
                    "font_size": font_size,
                    "font_name": font_name,
                    "bold_flag": bold_flag,
                    "x0": round(x0, 1),
                    "x1": round(x1, 1),
                })



            # Mean of the per-word maximum font sizes on this page.
            avg_fontsize = sum(font_sizes) / len(font_sizes) if font_sizes else 0

            # Distance thresholds
            MAX_DIST_X = 9
            MAX_DIST_Y = 10

            def are_words_close(w1, w2):
                """Return True if the gap between both word boxes is within the thresholds."""
                # Check whether the words are spatially close to each other
                dx = max(0, max(w1["x0"], w2["x0"]) - min(w1["x1"], w2["x1"]))
                dy = max(0, max(w1["top"], w2["top"]) - min(w1["bottom"], w2["bottom"]))
                return dx <= MAX_DIST_X and dy <= MAX_DIST_Y

            def group_into_blocks(words):
                """Group words into blocks by repeatedly attaching close neighbours."""
                blocks = []
                unvisited = set(range(len(words)))
                while unvisited:
                    idx = unvisited.pop()
                    block = {idx}
                    to_visit = {idx}
                    while to_visit:
                        current = to_visit.pop()
                        # Iterate over a copy because unvisited shrinks inside the loop.
                        for other in list(unvisited):
                            if are_words_close(words[current], words[other]):
                                block.add(other)
                                to_visit.add(other)
                                unvisited.remove(other)
                    blocks.append([words[i] for i in block])
                return blocks

            def group_block_into_lines(block, line_tolerance=2.5):
                """Split a block into lines; each line is sorted left to right."""
                # Group the words of a block into lines (by Y coordinate)
                sorted_words = sorted(block, key=lambda w: w["top"])
                lines = []
                #lines = [sorted(block, key=lambda w: w["x0"])]
                current_line = [sorted_words[0]]
                current_top = sorted_words[0]["top"]

                for word in sorted_words[1:]:
                    # Compared against the top of the first word of the current line.
                    if abs(word["top"] - current_top) <= line_tolerance:
                        current_line.append(word)
                    else:
                        lines.append(sorted(current_line, key=lambda w: w["x0"]))
                        current_line = [word]
                        current_top = word["top"]
                if current_line:
                    lines.append(sorted(current_line, key=lambda w: w["x0"]))
                return lines


            blocks = group_into_blocks(word_info)

            SORT_TOLERANCE = 1  # e.g. 1 point distance

            def round_to_nearest(value, tolerance):
                """Round *value* to the nearest multiple of *tolerance*."""
                return round(value / tolerance) * tolerance

            def get_block_reference(block):
                """Return the sort key ``(min x0, min top)`` of a block."""
                min_x0 = min(w["x0"] for w in block)
                min_top = min(w["top"] for w in block)
                return (
                    round_to_nearest(min_x0, SORT_TOLERANCE),
                    round_to_nearest(min_top, SORT_TOLERANCE),
                )

            # Sort blocks first by x0, then by top (row beginning)
            sorted_blocks = sorted(blocks, key=get_block_reference)

            # The string below is disabled debug code (a bare string
            # expression, never executed) that drew the block rectangles.
            '''
            # Visualisierung: Blocks als Rechtecke zeichnen
            im = page.to_image(resolution=150) # ggf. Auflösung anpassen

            for block in blocks:
            # Grenzen berechnen
            x0 = min(w["x0"] for w in block)
            top = min(w["top"] for w in block)
            x1 = max(w["x1"] for w in block)
            bottom = max(w["bottom"] for w in block)

            # Rechteck zeichnen (blauer Rahmen, Dicke 1)
            im.draw_rect((x0, top, x1, bottom), stroke="blue", stroke_width=1)

            # Bild speichern – Dateiname z. B. mit Seitenzahl
            im.save(f"page_{page_number + 1}_blocks.png")
            '''

            output_lines = []
            output_lines.append(f"\nPage {page_number + 1}, Seite {page_number + 1}, Página {page_number + 1}\n")  # page number

            for block_idx, block in enumerate(sorted_blocks, 1):
                lines = group_block_into_lines(block)

                chapter_hits = 0
                important_hits = 0
                block_label = None  # initialised here

                # Rule 1: skip short words and pure numbers.
                # NOTE(review): the original comment said "more than 3
                # characters", but the code skips words of 5 or fewer.
                for w in block:
                    text = w["text"]
                    if len(text) <= 5 or text.isdigit():
                        continue  # Rule 1 - skip all other rules

                    size_ratio = w["font_size"] / avg_fontsize if avg_fontsize else 0
                    bold_flag = w["bold_flag"]

                    # Rule 2 - takes precedence
                    if size_ratio >= 1.15:
                        chapter_hits += 1
                    # Rule 3 - only if rule 2 does not apply
                    elif bold_flag and size_ratio >= 1:
                        important_hits += 1

                total_hits = chapter_hits + important_hits

                # Rule 4 - decide based on the number of hits
                if total_hits > 1:
                    block_label = "IMPORTANT"
                elif total_hits == 1:
                    if chapter_hits == 1:
                        block_label = "CHAPTER"
                    elif important_hits == 1:
                        block_label = "IMPORTANT"

                output_lines.append("")  # blank line before each block

                for line_idx, line in enumerate(lines):
                    line_text = " ".join(w["text"] for w in line)
                    # Only the first line of a block carries the label.
                    if line_idx == 0 and block_label:
                        line_text = f"[{block_label}] {line_text}"
                    output_lines.append(line_text)



            # Append the tables (as before)
            for idx, tbl in enumerate(tables_json, 1):
                output_lines.append(f'"table {idx}":\n{tbl}')

            return page_number, "\n".join(output_lines)


    except Exception as e:
        # Report the failure as page text so one broken page does not abort
        # the whole file.
        msg = str(e).strip() or f"{type(e).__name__} (no message)"
        return args[0], f"[ERROR] Seite {args[0]+1}: {msg}"
369
+
370
+
371
+
372
def run_serial(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process the pages of one PDF one after another in this process.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (indices ``0..page_number-1``).
        tracker: Optional progress tracker; ``update()`` is called per page.
        progress_callback: Optional callable that receives status reports
            (only used when *tracker* is given).
        stop_flag: Optional event; when set, processing stops before the
            next page.

    Returns:
        List of the per-page results returned by ``process_page_worker``.
    """
    collected = []
    notify = progress_callback and tracker is not None
    for page_index in range(page_number):
        if stop_flag and stop_flag.is_set():
            break
        collected.append(process_page_worker((page_index, path)))
        if tracker is None:
            continue
        tracker.update()
        if notify:
            report_status(tracker, progress_callback)
    return collected
384
+
385
+
386
+
387
+
388
def run_parallel(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process the pages of one PDF in a pool of worker processes.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (indices ``0..page_number-1``).
        tracker: Optional progress tracker; ``update()`` is called per page.
        progress_callback: Optional callable that receives status reports
            (only used when *tracker* is given).
        stop_flag: Accepted for signature parity with ``run_serial`` but not
            evaluated here.

    Returns:
        List of per-page results in page order; empty slots are left out.
    """
    slots = [None] * page_number
    worker_count = min(page_number, get_physical_cores())

    with concurrent.futures.ProcessPoolExecutor(max_workers=worker_count) as pool:
        pending = [
            pool.submit(process_page_worker, (page_index, path))
            for page_index in range(page_number)
        ]
        for finished in concurrent.futures.as_completed(pending):
            # The stop flag is deliberately not checked here.
            outcome = finished.result()
            if outcome is None:
                continue
            page_index, _ = outcome
            slots[page_index] = outcome  # store by page index to keep order
            if tracker is None:
                continue
            tracker.update()
            if progress_callback:
                report_status(tracker, progress_callback)

    return [entry for entry in slots if entry]
411
+
412
+
413
def report_status(tracker, progress_callback=None):
    """Publish the tracker's current status.

    Args:
        tracker: Object whose ``get_status()`` returns a dict with the keys
            ``processed_pages``, ``total_pages``, ``pages_per_sec``,
            ``elapsed_time`` and ``est_time``.
        progress_callback: Optional callable that receives the status dict.
            Without it, a one-line summary is printed to stdout.
    """
    status = tracker.get_status()
    if progress_callback:
        progress_callback(status)
        return

    # elapsed_time / est_time are delivered in minutes by the tracker, so the
    # line is labelled accordingly. The old text said "Sek." and had an
    # unbalanced ")" with no separator before "Est Time".
    print(
        f"[STATUS] {status['processed_pages']}/{status['total_pages']} Seiten "
        f"({status['pages_per_sec']} Seiten/s, "
        f"Elapsed: {status['elapsed_time']} min, "
        f"Est Time: {status['est_time']} min)"
    )
422
+
423
+
424
def save_pdf(path, page_number, tracker=None, parallel=False, progress_callback=None, stop_flag=None):
    """Convert one PDF to text and write it next to the PDF as ``<name>.txt``.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process.
        tracker: Optional progress tracker handed to the page runner.
        parallel: Use ``run_parallel`` when true, ``run_serial`` otherwise.
        progress_callback: Optional status callback handed to the page runner.
        stop_flag: Optional event; if already set, nothing is done.

    Returns:
        ``0`` if the stop flag was set on entry, otherwise *page_number*.
    """
    if stop_flag and stop_flag.is_set():
        return 0

    runner = run_parallel if parallel else run_serial
    page_results = runner(path, page_number, tracker, progress_callback, stop_flag)

    # Drop empty entries (left behind by a stop), then restore page order.
    ordered = sorted((item for item in page_results if item), key=lambda item: item[0])
    text_output = "\n".join(page_text for _, page_text in ordered)

    out_path = os.path.splitext(path)[0] + ".txt"
    with open(out_path, "w", encoding="utf-8", errors="ignore") as handle:
        handle.write(text_output)

    return page_number
443
+
444
+
445
+
446
def _process_single_pdf(path):
    """Open one PDF with pdfminer and count its pages.

    Args:
        path: Path of the PDF file.

    Returns:
        Tuple ``(path, page_count, error)``. On success ``error`` is ``None``;
        on failure ``page_count`` is ``0`` and ``error`` is a ready-to-print
        ``[ERROR] ...`` line ending in a newline.
    """
    suppress_pdfminer_logging()
    try:
        with open(path, "rb") as handle:
            document = PDFDocument(PDFParser(handle))

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed("Text-Extraktion nicht erlaubt")

            # Count lazily instead of building a list of every page object.
            page_count = sum(1 for _ in PDFPage.create_pages(document))
            return (path, page_count, None)

    # This handler must come before PDFEncryptionError: in pdfminer.six
    # PDFTextExtractionNotAllowed derives from PDFEncryptionError, so the
    # broader handler would otherwise report "password protected".
    except PDFTextExtractionNotAllowed as e:
        return (path, 0, f"[ERROR] Text-Extraktion nicht erlaubt: {path} ({type(e).__name__}: {e})\n")
    except (PDFEncryptionError, PDFPasswordIncorrect) as e:
        return (path, 0, f"[ERROR] Datei passwortgeschützt: {path} ({type(e).__name__}: {e})\n")
    except PDFSyntaxError as e:
        return (path, 0, f"[ERROR] Ungültige PDF-Syntax: {path} ({type(e).__name__}: {e})\n")
    except Exception as e:
        # Last resort: report any other failure as an error line.
        return (path, 0, f"[ERROR] Fehler bei Datei {path}: {type(e).__name__}: {e}\n")
467
+
468
def get_total_pages(pdf_files, error_callback=None, progress_callback=None):
    """Count the pages of every PDF in *pdf_files*.

    Args:
        pdf_files: Sequence of PDF paths.
        error_callback: Optional callable receiving the error line of each
            file that could not be read; without it the line is printed.
        progress_callback: Optional callable receiving the running page total
            after each readable file.

    Returns:
        Tuple ``(page_info, total)`` where ``page_info`` is a list of
        ``(path, page_count)`` for the readable files and ``total`` is the
        sum of their page counts.
    """
    suppress_pdfminer_logging()
    page_info = []

    def consume(outcomes):
        """Record each ``(path, count, error)`` triple; return the page sum."""
        running_total = 0
        for pdf_path, page_count, error in outcomes:
            if not error:
                page_info.append((pdf_path, page_count))
                running_total += page_count
                if progress_callback:
                    progress_callback(running_total)  # feedback for the GUI
            elif error_callback:
                error_callback(error)
            else:
                print(error, end="")
        return running_total

    if len(pdf_files) > 14:
        # Many files: count them in a pool of worker processes.
        with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as pool:
            total = consume(pool.map(_process_single_pdf, pdf_files))
    else:
        total = consume(map(_process_single_pdf, pdf_files))

    return page_info, total
497
+
498
+
499
+
500
+
501
+ # -------------------- GUI --------------------
502
class FileManager(wx.Frame):
    """Main window: manage a list of PDFs, run the parser, show the results."""

    def __init__(self, parent):
        """Create the frame, its state and all widgets."""
        super().__init__(parent, title="PDF Parser - Sevenof9_v7d", size=(1000, 800))
        self.files = []
        # Created before the widgets so no handler can run without it.
        self.stop_flag = threading.Event()
        self.InitUI()

    def InitUI(self):
        """Build the widgets and wire up their event handlers."""
        panel = wx.Panel(self)
        vbox = wx.BoxSizer(wx.VERTICAL)

        hbox_lbl1 = wx.BoxSizer(wx.HORIZONTAL)

        lbl1 = wx.StaticText(panel, label="Filed PDF files: (with right mouse you can remove and open)")
        hbox_lbl1.Add(lbl1, flag=wx.ALIGN_CENTER_VERTICAL | wx.LEFT, border=10)

        hbox_lbl1.AddStretchSpacer()  # pushes the help button to the far right

        help_btn = wx.Button(panel, label="? HELP ?", size=(60, 25))
        help_btn.Bind(wx.EVT_BUTTON, self.ShowHelpText)
        hbox_lbl1.Add(help_btn, flag=wx.RIGHT, border=10)

        vbox.Add(hbox_lbl1, flag=wx.EXPAND | wx.TOP, border=10)

        self.listbox = wx.ListBox(panel, style=wx.LB_EXTENDED)
        self.listbox.Bind(wx.EVT_RIGHT_DOWN, self.OnRightClick)
        self.listbox.Bind(wx.EVT_LISTBOX, self.ShowText)
        vbox.Add(self.listbox, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        # Context menu for the file list.
        self.popup_menu = wx.Menu()
        self.popup_menu.Append(1, "Remove selected")
        self.popup_menu.Append(2, "Open in default PDF app")
        self.popup_menu.Append(3, "Copy File Location")
        self.popup_menu.Append(4, "Open File Location")
        self.Bind(wx.EVT_MENU, self.RemoveFile, id=1)
        self.Bind(wx.EVT_MENU, self.OpenPDF, id=2)
        self.Bind(wx.EVT_MENU, self.CopyFileLocation, id=3)
        self.Bind(wx.EVT_MENU, self.OpenFileLocation, id=4)

        btn_panel = wx.Panel(panel)
        btn_sizer = wx.BoxSizer(wx.HORIZONTAL)
        for label, handler in [
            ("Add Folder", self.AddFolder),
            ("Select Files", self.AddFile),
            ("Remove Selected", self.RemoveFile),
            ("Remove All", self.RemoveAll),
            ("Stop Parser", self.StopParser),
            ("Start Parser", self.StartParser),
        ]:
            btn = wx.Button(btn_panel, label=label)
            btn.Bind(wx.EVT_BUTTON, handler)
            if label == "Start Parser":
                self.start_btn = btn  # kept so it can be disabled while parsing
            btn_sizer.Add(btn, proportion=1, flag=wx.ALL, border=5)
        btn_panel.SetSizer(btn_sizer)
        vbox.Add(btn_panel, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        lbl2 = wx.StaticText(panel, label="Text Frame: (choose PDF to see converted text)")
        vbox.Add(lbl2, flag=wx.LEFT, border=10)

        self.text_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        self.ShowHelpText(None)
        vbox.Add(self.text_ctrl, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        # Status display
        stat_grid = wx.FlexGridSizer(1, 5, 5, 55)
        self.lbl_processed_pages = wx.StaticText(panel, label="Processed pages: 0")
        self.lbl_total_pages = wx.StaticText(panel, label="Total pages: 0")
        self.lbl_pages_per_sec = wx.StaticText(panel, label="Pages/sec: 0")
        self.lbl_est_time = wx.StaticText(panel, label="Estimated time (min): 0.0")
        self.lbl_elapsed_time = wx.StaticText(panel, label="Elapsed time: 0.0")

        for lbl in [self.lbl_processed_pages, self.lbl_total_pages, self.lbl_pages_per_sec, self.lbl_est_time, self.lbl_elapsed_time]:
            stat_grid.Add(lbl)
        vbox.Add(stat_grid, flag=wx.LEFT | wx.TOP, border=10)

        self.prog_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        vbox.Add(self.prog_ctrl, proportion=1, flag=wx.EXPAND | wx.ALL, border=10)

        panel.SetSizer(vbox)

    def ShowHelpText(self, event):
        """Show the built-in help text in the text frame."""
        help_text = (
            " This is a small help\n\n"
            " • PRE ALPHA version (for ever) •\n"
            "• The generated TXT file has the same name as the PDF file\n"
            "• The TXT file is created in the same directory as the PDF\n"
            "• Older TXT files will be overwritten without prompting\n"
            "• When selecting a folder, subfolders are also selected\n"
            "If:\n"
            "[INFO] File completed: TEST.pdf (X pages)!\n"
            "[INFO] Processing completed\n"
            "-> This only means that all pages have been processed; it does not mean that the quality is good.\n"
            "• An attempt is made to reproduce the layout of the page in columns from left to right and in blocks from top to bottom\n"
            "• An attempt is made to detect regular tables with lines; headers (top or top and left) are assigned to the cells and stored in JSON format in the text file\n"
            "\n"
            "Stop function becomes effective only after the currently processed file\n"
            "When processing large amounts of data, the following should be noted:\n"
            "First, all PDFs are opened once to determine the number of pages:\n"
            "Then, all small PDFs are processed in parallel:\n"
            "Then, each large PDF is processed page by page in parallel:\n"
        )
        self.text_ctrl.SetValue(help_text)

    def _add_path(self, path):
        """Add *path* to the list unless it is already present."""
        # Normalising here means the same file picked via "Add Folder" and
        # "Select Files" is recognised as a duplicate.
        path = os.path.normpath(path)
        if path not in self.files:
            self.files.append(path)
            self.listbox.Append(path)

    def AddFolder(self, event):
        """Add every ``.pdf`` below a chosen folder (subfolders included)."""
        with wx.DirDialog(self, "Select Folder") as dlg:
            if dlg.ShowModal() != wx.ID_OK:
                return
            for root, _, files in os.walk(dlg.GetPath()):
                for f in files:
                    if f.lower().endswith(".pdf"):
                        self._add_path(os.path.join(root, f))

    def AddFile(self, event):
        """Add the PDF files picked in a file dialog."""
        with wx.FileDialog(self, "Select PDF Files", wildcard="PDF files (*.pdf)|*.pdf",
                           style=wx.FD_OPEN | wx.FD_MULTIPLE) as dlg:
            if dlg.ShowModal() == wx.ID_OK:
                for path in dlg.GetPaths():
                    self._add_path(path)

    def RemoveFile(self, event):
        """Remove the selected entries from the list."""
        # Delete from the highest index down so the remaining indices stay valid.
        for i in sorted(self.listbox.GetSelections(), reverse=True):
            self.listbox.Delete(i)
            del self.files[i]
        self.text_ctrl.Clear()

    def RemoveAll(self, event):
        """Remove every entry from the list."""
        self.listbox.Clear()
        self.files.clear()
        self.text_ctrl.Clear()

    def OpenPDF(self, event):
        """Open the first selected PDF with the platform's default application."""
        i = self.listbox.GetSelections()
        if i:
            path = self.files[i[0]]
            if platform.system() == "Windows":
                os.startfile(path)
            elif platform.system() == "Darwin":
                subprocess.call(["open", path])
            else:
                subprocess.call(["xdg-open", path])

    def CopyFileLocation(self, event):
        """Copy the path of the first selected PDF to the clipboard."""
        sel = self.listbox.GetSelections()
        if sel:
            path = self.files[sel[0]]
            if wx.TheClipboard.Open():
                wx.TheClipboard.SetData(wx.TextDataObject(path))
                wx.TheClipboard.Close()

    def OpenFileLocation(self, event):
        """Open the folder of the first selected PDF in the file manager."""
        sel = self.listbox.GetSelections()
        if sel:
            folder = os.path.dirname(self.files[sel[0]])
            if platform.system() == "Windows":
                # Argument list instead of a hand-quoted command string.
                subprocess.Popen(["explorer", folder])
            elif platform.system() == "Darwin":
                subprocess.call(["open", folder])
            else:
                subprocess.call(["xdg-open", folder])

    def OnRightClick(self, event):
        """Show the context menu when at least one entry is selected."""
        if self.listbox.GetSelections():
            # The event position is relative to the list box, not the frame.
            # Omitting it lets wx place the menu at the mouse cursor.
            self.PopupMenu(self.popup_menu)

    def StartParser(self, event):
        """Convert all listed PDFs in a background thread."""
        if not self.files:
            wx.MessageBox("Please select files first.", "Hinweis", wx.OK | wx.ICON_INFORMATION)
            return

        self.start_btn.Disable()
        self.stop_flag.clear()
        self.prog_ctrl.Clear()

        # Snapshot: list edits made while the worker runs must not affect it.
        files = list(self.files)

        def error_callback(msg):
            wx.CallAfter(self.AppendProg, msg)

        def update_total_pages_live(new_total):
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {new_total}")

        def gui_progress_callback(status):
            wx.CallAfter(self.lbl_processed_pages.SetLabel, f"Processed pages: {status['processed_pages']}")
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {status['total_pages']}")
            wx.CallAfter(self.lbl_pages_per_sec.SetLabel, f"Pages/sec: {status['pages_per_sec']}")
            wx.CallAfter(self.lbl_est_time.SetLabel, f"Estimated time (min): {status['est_time']}")
            wx.CallAfter(self.lbl_elapsed_time.SetLabel, f"Elapsed time: {status['elapsed_time']}")

        throttled_gui_callback = throttle_callback(gui_progress_callback, 100)

        def background():
            try:
                # Counting pages opens every PDF once. Doing that here instead
                # of in the event handler keeps the window responsive.
                page_info, total_pages = get_total_pages(
                    files,
                    error_callback=error_callback,
                    progress_callback=update_total_pages_live,
                )

                if total_pages == 0:
                    wx.CallAfter(self.AppendProg, "[INFO] No pages found.\n")
                    return

                tracker = StatusTracker(total_pages)

                small = [p for p in page_info if p[1] <= PARALLEL_THRESHOLD]
                large = [p for p in page_info if p[1] > PARALLEL_THRESHOLD]

                # Small files: one worker process per file
                if small:
                    max_workers = max(1, min(len(small), get_physical_cores()))
                    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
                        futures = {}
                        for path, count in small:
                            if self.stop_flag.is_set():
                                break
                            future = executor.submit(save_pdf, path, count, None, False, None)
                            futures[future] = (path, count)

                        for future in concurrent.futures.as_completed(futures):
                            if self.stop_flag.is_set():
                                # Drop the jobs that have not started yet;
                                # running ones finish when the pool closes.
                                for pending in futures:
                                    pending.cancel()
                                break
                            path, count = futures[future]
                            try:
                                pages_processed = future.result()
                                tracker.update(pages_processed)
                                throttled_gui_callback(tracker.get_status())
                                wx.CallAfter(self.AppendProg, f"[INFO] File ready: {path} ({pages_processed} Seiten)\n")
                            except Exception as e:
                                wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")

                # Large files: page by page in parallel
                for path, count in large:
                    if self.stop_flag.is_set():
                        break

                    try:
                        pages_processed = save_pdf(
                            path,
                            count,
                            tracker,
                            parallel=True,
                            progress_callback=throttled_gui_callback,
                            stop_flag=self.stop_flag,
                        )
                        if pages_processed:
                            wx.CallAfter(
                                self.AppendProg,
                                f"[INFO] File ready: {path} ({pages_processed} Seiten)\n",
                            )
                        else:
                            wx.CallAfter(
                                self.AppendProg,
                                f"[INFO] Stopped: {path}\n",
                            )
                    except Exception as e:
                        wx.CallAfter(
                            self.AppendProg,
                            f"[ERROR] File {path}: {str(e)}\n",
                        )

                wx.CallAfter(self.AppendProg, "\n[INFO] Processing completed.\n")
            except Exception as e:
                # Top-level boundary of the worker thread: report the failure
                # instead of letting the thread die silently.
                wx.CallAfter(self.AppendProg, f"[ERROR] {type(e).__name__}: {e}\n")
            finally:
                # Always give the Start button back, whatever happened above.
                wx.CallAfter(self.start_btn.Enable)
                self.stop_flag.clear()

        threading.Thread(target=background, daemon=True).start()

    def StopParser(self, event):
        """Ask the background worker to stop."""
        self.stop_flag.set()
        self.AppendProg("[INFO] Processing Stopped...\n")

    def ShowText(self, event):
        """Show the ``.txt`` that belongs to the first selected PDF."""
        sel = self.listbox.GetSelections()
        if not sel:
            return
        txt_path = os.path.splitext(self.files[sel[0]])[0] + ".txt"
        self.text_ctrl.Clear()
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                self.text_ctrl.SetValue(f.read())
        else:
            self.text_ctrl.SetValue("[No .txt file found]")

    def AppendProg(self, text):
        """Append *text* to the progress log."""
        self.prog_ctrl.AppendText(text)
803
+
804
+
805
+ # -------------------- Einstiegspunkt --------------------
806
def main():
    """Start the GUI, or convert the PDFs given on the command line."""
    cli_paths = sys.argv[1:]

    if not cli_paths:
        # No arguments: start the graphical interface.
        app = wx.App(False)
        frame = FileManager(None)
        frame.Show()
        app.MainLoop()
        return

    page_info, total_pages = get_total_pages(cli_paths)
    tracker = StatusTracker(total_pages)

    def cli_callback(status):
        # One JSON object per progress update on stdout.
        print(json.dumps(status))

    for path, count in page_info:
        use_parallel = count > PARALLEL_THRESHOLD
        save_pdf(path, count, tracker, parallel=use_parallel, progress_callback=cli_callback)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()