kalle07 committed on
Commit
458f010
·
verified ·
1 Parent(s): b853362

Fixed second round exe compiling for multiprocessing and joblib

Browse files
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  parser_sevenof9_v1_de.exe filter=lfs diff=lfs merge=lfs -text
37
  parser_sevenof9_v1_en.exe filter=lfs diff=lfs merge=lfs -text
38
  parser_sevenof9_v1_1_en.exe filter=lfs diff=lfs merge=lfs -text
 
 
36
  parser_sevenof9_v1_de.exe filter=lfs diff=lfs merge=lfs -text
37
  parser_sevenof9_v1_en.exe filter=lfs diff=lfs merge=lfs -text
38
  parser_sevenof9_v1_1_en.exe filter=lfs diff=lfs merge=lfs -text
39
+ parser_sevenof9_v1_2.exe filter=lfs diff=lfs merge=lfs -text
parser_sevenof9_v1_2.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2e8db416754c1b9d569588829a75376a8eab36ea60cf8e66e52c87e1832a7ad
3
+ size 25758412
parser_sevenof9_v1_2_en.py ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tkinter as tk # internal
4
+ from tkinter import filedialog, messagebox # internal
5
+ import subprocess
6
+ import threading
7
+ import tempfile
8
+ import shutil
9
+ import json
10
+ import logging
11
+ import pdfplumber
12
+ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
13
+ from pdfplumber.utils.exceptions import PdfminerException
14
+ from joblib import delayed, cpu_count, Parallel
15
+ import multiprocessing # internal
16
+ from multiprocessing import Pool # internal
17
+ from multiprocessing import set_executable
18
+
19
+
20
+ # ========================
21
+ # Parser Configuration
22
+ # ========================
23
+
24
# Keyword arguments that process_page() forwards to pdfplumber's
# page.extract_words() via ``**text_settings``.
TEXT_EXTRACTION_SETTINGS = dict(
    x_tolerance=1,
    y_tolerance=3,
    keep_blank_chars=False,
    use_text_flow=True,
)

# On Windows everything written to stderr is discarded for the lifetime of
# the process; the handle is intentionally never closed.
if sys.platform == "win32":
    sys.stderr = open(os.devnull, 'w')

# PDFs with at most this many pages are processed page by page in one process;
# longer ones are handed to a multiprocessing pool (see process_pdf()).
PARALLEL_THRESHOLD = 16
35
+
36
def suppress_pdfminer_logging():
    """Set the pdfminer loggers to ERROR so lower-severity records are dropped.

    The level is applied to the package logger and to each listed
    sub-module logger individually.
    """
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        quiet_logger = logging.getLogger(name)
        quiet_logger.setLevel(logging.ERROR)
48
+
49
def clean_cell_text(text):
    """Normalise the text of one table cell to a single line.

    A hyphen directly followed by a line break is removed (re-joining a word
    split across lines), remaining line breaks become spaces, and every run
    of whitespace is collapsed to one space.

    Returns an empty string when *text* is not a ``str``.
    """
    if isinstance(text, str):
        dehyphenated = text.replace("-\n", "")
        single_line = dehyphenated.replace("\n", " ")
        return " ".join(single_line.split())
    return ""
54
+
55
def safe_join(row):
    """Return the cells of *row* as a list of cleaned strings.

    ``None`` cells become ``""``; every other cell is converted with
    ``str()`` and passed through ``clean_cell_text``.
    """
    cleaned = []
    for cell in row:
        if cell is None:
            cleaned.append("")
        else:
            cleaned.append(clean_cell_text(str(cell)))
    return cleaned
57
+
58
def clamp_bbox(bbox, page_width, page_height):
    """Clamp a ``(x0, top, x1, bottom)`` box to the page rectangle.

    Horizontal coordinates are limited to ``[0, page_width]`` and vertical
    ones to ``[0, page_height]``. Returns the clamped box as a 4-tuple in the
    same order.
    """
    left, upper, right, lower = bbox

    def _limit(value, upper_bound):
        # Keep value inside [0, upper_bound].
        return max(0, min(value, upper_bound))

    return (
        _limit(left, page_width),
        _limit(upper, page_height),
        _limit(right, page_width),
        _limit(lower, page_height),
    )
65
+
66
def _unique_headers(headers):
    """Return *headers* with repeated names made unique by a ``_N`` suffix."""
    used = set()
    unique = []
    for name in headers:
        candidate = name
        suffix = 1
        while candidate in used:
            suffix += 1
            candidate = f"{name}_{suffix}"
        used.add(candidate)
        unique.append(candidate)
    return unique


def process_page(args):
    """Extract the text and tables of a single PDF page.

    Args:
        args: ``(page_number, pdf_path, text_settings)`` where *page_number*
            is the zero-based page index, *pdf_path* the PDF file to open and
            *text_settings* the keyword arguments for ``page.extract_words``.
            One tuple is used so the function can be passed to ``Pool.map``.

    Returns:
        ``(page_number, text)``. *text* starts with ``"Page <n>"`` followed by
        the words found outside table areas, then one JSON block per table.
        If anything fails, *text* is an ``[ERROR]`` message instead; the
        exception is not propagated.
    """
    suppress_pdfminer_logging()
    try:
        page_number, pdf_path, text_settings = args
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height

            table_bboxes = []
            table_json_outputs = []

            for table in page.find_tables():
                bbox = clamp_bbox(table.bbox, width, height)
                table_bboxes.append(bbox)

                # Skip detected tables whose area contains no characters.
                if not page.crop(bbox).chars:
                    continue

                table_data = table.extract()
                if table_data:
                    # The first row supplies the JSON keys. Repeated or empty
                    # header cells would collapse into one dict key and drop
                    # columns, so the keys are made unique first.
                    headers = _unique_headers(safe_join(table_data[0]))
                    json_table = [
                        dict(zip(headers, safe_join(row)))
                        for row in table_data[1:]
                    ]
                    table_json_outputs.append(
                        json.dumps(json_table, indent=1, ensure_ascii=False)
                    )

            # A word counts as table content when its top-left corner lies
            # inside any table bounding box.
            words_outside_tables = [
                word for word in page.extract_words(**text_settings)
                if not any(
                    bbox[0] <= float(word['x0']) <= bbox[2] and
                    bbox[1] <= float(word['top']) <= bbox[3]
                    for bbox in table_bboxes
                )
            ]

            # Group words into lines: a new line starts when a word's top
            # differs by more than 10 units from the top of the word that
            # opened the current line.
            lines = []
            current_line = []
            current_y = None
            for word in words_outside_tables:
                if current_y is None or abs(word['top'] - current_y) > 10:
                    if current_line:
                        lines.append(" ".join(current_line))
                    current_line = [word['text']]
                    current_y = word['top']
                else:
                    current_line.append(word['text'])
            if current_line:
                lines.append(" ".join(current_line))

            parts = [f"Page {page_number + 1}\n", "\n".join(lines).strip(), "\n"]
            parts.extend(
                f'"table {idx}":\n{table}\n'
                for idx, table in enumerate(table_json_outputs, start=1)
            )

            return page_number, "".join(parts)

    except Exception as e:
        # Report the failure as this page's text so one broken page does not
        # abort the whole document.
        return args[0], f"[ERROR] Page {args[0]+1} ({args[1]}): {str(e)}"
130
+
131
def process_pdf(pdf_path):
    """Convert one PDF into a ``.txt`` file stored next to it.

    The page count decides the strategy: up to ``PARALLEL_THRESHOLD`` pages
    are handled by ``run_serial``, more by ``run_parallel``. Page results are
    sorted by page index and written to ``<pdf name without extension>.txt``
    in the PDF's directory.

    Returns:
        ``None`` on success (progress is printed), otherwise an ``[ERROR]`` or
        ``[INFO]`` message string describing why processing stopped.
    """
    suppress_pdfminer_logging()
    interruptions = (EOFError, BrokenPipeError, KeyboardInterrupt)
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")

        # Open once up front only to learn the page count; each page job
        # re-opens the file on its own.
        try:
            with pdfplumber.open(pdf_path) as document:
                page_total = len(document.pages)
        except PdfminerException as exc:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {exc}"
        except Exception as exc:
            return f"[ERROR] General error opening PDF: {pdf_path} – {exc}"

        jobs = []
        for page_index in range(page_total):
            jobs.append((page_index, pdf_path, TEXT_EXTRACTION_SETTINGS))

        try:
            if page_total <= PARALLEL_THRESHOLD:
                page_results = run_serial(jobs)
            else:
                page_results = run_parallel(jobs)
        except interruptions:
            return "[INFO] Processing was interrupted."

        # Restore page order before concatenating the page texts.
        ordered = sorted(page_results, key=lambda item: item[0])
        final_output = "\n".join(page_text for _, page_text in ordered)

        stem, _extension = os.path.splitext(os.path.basename(pdf_path))
        target_dir = os.path.dirname(pdf_path)
        output_path = os.path.join(target_dir, f"{stem}.txt")

        with open(output_path, "w", encoding="utf-8", errors="ignore") as handle:
            handle.write(final_output)

        print(f"[INFO] Processing complete: {output_path}")

    except interruptions:
        return "[INFO] Processing interrupted by user."
    except Exception as exc:
        return f"[ERROR] Unexpected error with '{pdf_path}': {exc}"
169
+
170
def run_serial(pages):
    """Run ``process_page`` on every job in *pages*, in order, in this process.

    Returns the list of ``process_page`` results.
    """
    return list(map(process_page, pages))
172
+
173
def run_parallel(pages):
    """Run ``process_page`` on every job in *pages* using a process pool.

    The pool size is the CPU count minus two (at least one), capped at the
    number of jobs. Returns the ``Pool.map`` result list.
    """
    spare_cores = cpu_count() - 2
    available_cores = spare_cores if spare_cores > 1 else 1
    num_cores = min(available_cores, len(pages))
    print(f"Starting parallel processing with {num_cores} cores...")
    with Pool(processes=num_cores) as worker_pool:
        return worker_pool.map(process_page, pages)
179
+
180
def process_pdfs_main():
    """Command-line entry: process every existing ``.pdf`` path in ``sys.argv``.

    Phase 1 handles PDFs with at most ``PARALLEL_THRESHOLD`` pages, several
    files at a time through joblib. Phase 2 handles the larger PDFs one after
    another; each of those is parallelised per page inside ``process_pdf``.
    Messages returned by ``process_pdf`` are printed; it returns ``None`` on
    success, and those results are not printed.
    """
    suppress_pdfminer_logging()
    pdf_files = [
        arg for arg in sys.argv[1:]
        if arg.lower().endswith(".pdf") and os.path.isfile(arg)
    ]
    if not pdf_files:
        print("No PDF files provided.")
        return

    small_pdfs = []
    large_pdfs = []

    for path in pdf_files:
        try:
            with pdfplumber.open(path) as pdf:
                page_count = len(pdf.pages)
        except PdfminerException as e:
            # Reported the same way process_pdf() reports this exception; the
            # cause is not necessarily a password.
            print(f"[ERROR] Cannot open PDF, skipped: {path} – {e}")
            continue
        except Exception as e:
            print(f"[ERROR] Error opening {path}: {str(e)}")
            continue

        if page_count <= PARALLEL_THRESHOLD:
            small_pdfs.append(path)
        else:
            large_pdfs.append(path)

    if small_pdfs:
        available_cores = max(1, cpu_count() - 2)
        num_cores = min(available_cores, len(small_pdfs))
        print(f"\n[Phase 1] Starting parallel processing of small PDFs with {num_cores} cores...")
        results = Parallel(n_jobs=num_cores)(
            delayed(process_pdf)(path) for path in small_pdfs
        )
        for message in results:
            # None means success; only failures and notices carry a message.
            if message is not None:
                print(message)

    for path in large_pdfs:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(path)}")
        message = process_pdf(path)
        if message is not None:
            print(message)
218
+
219
+
220
+ # ========================
221
+ # GUI Class
222
+ # ========================
223
+
224
class FileManager:
    """Tkinter front end for collecting PDF files and running the parser.

    The window holds a list of selected PDF paths, a preview of the ``.txt``
    file that belongs to the selected PDF, and a log of the parser's output.
    Parsing runs in a child process started from a background thread; all
    widget updates from that thread are scheduled through ``after``.
    """

    def __init__(self, master):
        """Build all widgets inside *master* (the Tk root window)."""
        self.master = master
        self.master.title("Parser-Sevenof9")
        self.files = []  # PDF paths, index-aligned with the listbox rows
        self.last_selected_index = None  # anchor row for shift-click ranges
        self.parser_process = None  # Popen handle, set by the worker thread
        self.parser_thread = None  # thread currently executing run_parser
        self._stop_requested = False  # True once the user pressed "Stop"

        self.label = tk.Label(master, text="Selected PDF files:")
        self.label.pack(pady=5)

        listbox_frame = tk.Frame(master)
        listbox_frame.pack(pady=5)

        scrollbar_listbox = tk.Scrollbar(listbox_frame)
        self.listbox = tk.Listbox(
            listbox_frame,
            selectmode=tk.MULTIPLE,
            width=80,
            height=6,
            yscrollcommand=scrollbar_listbox.set,
        )
        scrollbar_listbox.config(command=self.listbox.yview)

        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_listbox.pack(side=tk.RIGHT, fill=tk.Y)

        self.listbox.bind("<<ListboxSelect>>", self.show_text_file)
        self.listbox.bind("<Button-1>", self.on_listbox_click)
        self.listbox.bind("<Shift-Button-1>", self.on_listbox_shift_click)

        self.context_menu = tk.Menu(master, tearoff=0)
        self.context_menu.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self.show_context_menu)

        self.frame = tk.Frame(master)
        self.frame.pack(pady=10)

        tk.Button(self.frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)

        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        text_frame = tk.Frame(master)
        text_frame.pack(padx=10, pady=5)

        scrollbar_text = tk.Scrollbar(text_frame)
        self.text_widget = tk.Text(
            text_frame,
            height=15,
            width=100,
            wrap=tk.WORD,
            yscrollcommand=scrollbar_text.set,
        )
        scrollbar_text.config(command=self.text_widget.yview)

        self.text_widget.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_text.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Progress:").pack()

        progress_frame = tk.Frame(master)
        progress_frame.pack(padx=10, pady=5)

        scrollbar_progress = tk.Scrollbar(progress_frame)
        self.progress_text = tk.Text(
            progress_frame,
            height=8,
            width=100,
            state=tk.DISABLED,
            yscrollcommand=scrollbar_progress.set,
        )
        scrollbar_progress.config(command=self.progress_text.yview)

        self.progress_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_progress.pack(side=tk.RIGHT, fill=tk.Y)

    def on_listbox_click(self, event):
        """Select only the clicked row and show its text file."""
        index = self.listbox.nearest(event.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(index)
        self.last_selected_index = index
        self.show_text_file(None)
        # Stop Tk's default handling so it does not alter the selection again.
        return "break"

    def on_listbox_shift_click(self, event):
        """Select the range between the last plain click and this row."""
        index = self.listbox.nearest(event.y)
        if self.last_selected_index is None:
            self.last_selected_index = index
        start, end = sorted((self.last_selected_index, index))
        self.listbox.selection_clear(0, tk.END)
        for i in range(start, end + 1):
            self.listbox.selection_set(i)
        return "break"

    def show_context_menu(self, event):
        """Open the right-click menu if at least one row is selected."""
        if self.listbox.curselection():
            self.context_menu.tk_popup(event.x_root, event.y_root)

    def add_folder(self):
        """Ask for a folder and add every ``.pdf`` below it (recursively)."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, files in os.walk(folder):
            for file in files:
                if file.lower().endswith(".pdf"):
                    path = os.path.join(root, file)
                    if path not in self.files:
                        self.files.append(path)
                        self.listbox.insert(tk.END, path)

    def add_file(self):
        """Ask for one or more PDF files and add those not yet listed."""
        paths = filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")])
        for path in paths:
            if path not in self.files:
                self.files.append(path)
                self.listbox.insert(tk.END, path)

    def remove_file(self):
        """Remove the selected rows from the listbox and from ``self.files``."""
        selection = self.listbox.curselection()
        if not selection:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the bottom up so the remaining indices stay valid.
        for index in reversed(selection):
            self.listbox.delete(index)
            del self.files[index]
        self.text_widget.delete(1.0, tk.END)

    def remove_all(self):
        """Clear the file list and the text preview."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text_widget.delete(1.0, tk.END)

    def start_parser(self):
        """Start the parser for all listed files in a background thread.

        Does nothing except show a notice when no files are listed or when a
        previous parser run is still active.
        """
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        # A second run would overwrite the handle of the first subprocess,
        # leaving it impossible to stop.
        if self.parser_thread is not None and self.parser_thread.is_alive():
            messagebox.showinfo("Parser Running", "The parser is already running.")
            return
        self._stop_requested = False
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.delete(1.0, tk.END)
        self.progress_text.insert(tk.END, "Starting parser...\n")
        self.progress_text.config(state=tk.DISABLED)
        self.parser_thread = threading.Thread(target=self.run_parser)
        self.parser_thread.start()

    def stop_parser(self):
        """Terminate the running parser subprocess, if there is one."""
        # Snapshot the handle: the worker thread resets the attribute to None
        # when the run ends, which could happen between the check and the call.
        process = self.parser_process
        if process is not None and process.poll() is None:
            self._stop_requested = True
            process.terminate()
            self.append_progress_text("Parser process was stopped.\n")
        else:
            self.append_progress_text("No active parser process to stop.\n")

    def run_parser(self):
        """Run this program as a subprocess on ``self.files`` and relay its output.

        Executed on the background thread started by ``start_parser``. The
        child's combined stdout/stderr is appended line by line to the
        progress log; the exit code decides the final message unless the run
        was stopped by the user.
        """
        try:
            script_path = os.path.abspath(sys.argv[0])
            process = subprocess.Popen(
                [sys.executable, script_path] + self.files,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding='utf-8',
                errors='ignore',
                bufsize=4096
            )
            self.parser_process = process
            for line in process.stdout:
                self.append_progress_text(line)
            process.stdout.close()
            process.wait()

            if self._stop_requested:
                # stop_parser() already logged the stop; a terminated child
                # has a non-zero exit code but that is not a parser error.
                pass
            elif process.returncode == 0:
                self.append_progress_text("\nParser finished successfully.\n")
                self.show_messagebox_threadsafe("Parser Done", "The parser was executed successfully.")
            else:
                self.append_progress_text("\nError while running the parser.\n")
                self.show_messagebox_threadsafe("Error", "Error while running the parser.")
        except Exception as e:
            self.append_progress_text(f"Error: {e}\n")
            self.show_messagebox_threadsafe("Error", f"Error during execution:\n{e}")
        finally:
            self.parser_process = None

    def append_progress_text(self, text):
        """Schedule *text* to be appended to the progress log via ``after``."""
        self.progress_text.after(0, lambda: self._insert_text(text))

    def _insert_text(self, text):
        """Append *text* to the read-only progress log and scroll to the end."""
        # The widget is kept DISABLED; enable it only for the insert.
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.insert(tk.END, text)
        self.progress_text.see(tk.END)
        self.progress_text.config(state=tk.DISABLED)

    def show_messagebox_threadsafe(self, title, message):
        """Schedule an info message box via ``after`` on the root window."""
        self.master.after(0, lambda: messagebox.showinfo(title, message))

    def show_text_file(self, event):
        """Show the ``.txt`` file belonging to the first selected PDF.

        The text file is looked up next to the PDF, with the same base name.
        *event* is unused; it exists so the method can serve as a Tk binding.
        """
        selection = self.listbox.curselection()
        if not selection:
            return
        index = selection[0]
        path = self.files[index]
        txt_path = os.path.splitext(path)[0] + ".txt"
        self.text_widget.delete(1.0, tk.END)
        if os.path.exists(txt_path):
            try:
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    self.text_widget.insert(tk.END, f.read())
            except Exception as e:
                self.text_widget.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text_widget.insert(tk.END, "[No corresponding .txt file found]")
417
+
418
+ # ========================
419
+ # Entry Point
420
+ # ========================
421
+
422
def is_valid_pdf_arg(arg):
    """Return True if *arg* ends in ``.pdf`` (any case) and is an existing file."""
    return arg.lower().endswith(".pdf") and os.path.isfile(arg)


def is_worker_process():
    """Return True if the command line looks like a helper/worker process.

    Child processes started by multiprocessing or joblib's loky backend are
    expected to carry ``resource_tracker`` or ``loky`` in their arguments
    (NOTE(review): heuristic, confirm for the frozen executable). Arguments
    that are real PDF files are ignored, so a PDF whose path merely contains
    one of the markers does not count as a worker.
    """
    markers = ("resource_tracker", "loky")
    return any(
        marker in arg
        for arg in sys.argv
        if not is_valid_pdf_arg(arg)
        for marker in markers
    )


if __name__ == "__main__":
    # Must run first in the main block for multiprocessing on Windows.
    multiprocessing.freeze_support()

    if is_worker_process():
        pass  # Worker process: start neither the GUI nor PDF parsing.

    elif any(is_valid_pdf_arg(arg) for arg in sys.argv[1:]):
        process_pdfs_main()

    else:
        root = tk.Tk()
        app = FileManager(root)
        root.mainloop()
445
+
446
+