kalle07 commited on
Commit
b853362
·
verified ·
1 Parent(s): c0373dd

fixed exe error: File "Lib\site-packages\PyInstaller\hooks\rthooks\pyi_rth_multiprocessing.py", line 43, in _freeze_support ValueError: not enough values to unpack (expected 2, got 1)

Browse files
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  parser_sevenof9_v1_de.exe filter=lfs diff=lfs merge=lfs -text
37
  parser_sevenof9_v1_en.exe filter=lfs diff=lfs merge=lfs -text
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  parser_sevenof9_v1_de.exe filter=lfs diff=lfs merge=lfs -text
37
  parser_sevenof9_v1_en.exe filter=lfs diff=lfs merge=lfs -text
38
+ parser_sevenof9_v1_1_en.exe filter=lfs diff=lfs merge=lfs -text
parser_sevenof9_v1_1_en.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aabef5ef27754c71b86dee6e8cfd83b1d688278cceb383243236548d045e03d1
3
+ size 25576083
parser_sevenof9_v1_1_en.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import tkinter as tk # internal
4
+ from tkinter import filedialog, messagebox # internal
5
+ import subprocess
6
+ import threading
7
+ import tempfile
8
+ import shutil
9
+ import json
10
+ import logging
11
+ import pdfplumber
12
+ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
13
+ from pdfplumber.utils.exceptions import PdfminerException
14
+ from joblib import delayed, cpu_count, parallel_backend, Parallel
15
+ import multiprocessing # internal
16
+ from multiprocessing import Pool # internal
17
+
18
+
19
+ # ========================
20
+ # Parser Configuration
21
+ # ========================
22
+
23
# Keyword arguments forwarded to ``page.extract_words`` for every page.
TEXT_EXTRACTION_SETTINGS = dict(
    x_tolerance=1,
    y_tolerance=3,
    keep_blank_chars=False,
    use_text_flow=True,
)

# On Windows everything written to stderr is discarded.
# NOTE(review): presumably this keeps library warnings out of the packaged
# executable's output -- confirm before removing.
if sys.platform == "win32":
    sys.stderr = open(os.devnull, 'w')

# A PDF with more pages than this is spread over a process pool; anything
# up to and including this size is handled page by page in one process.
PARALLEL_THRESHOLD = 16
34
+
35
def suppress_pdfminer_logging():
    """Set the pdfminer loggers to ERROR so lower-severity records are dropped.

    The level is set explicitly on the package logger and on each listed
    sub-module logger.
    """
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for pdfminer_logger in map(logging.getLogger, noisy_loggers):
        pdfminer_logger.setLevel(logging.ERROR)
47
+
48
def clean_cell_text(text):
    """Normalise the text of a single table cell.

    A hyphen directly followed by a line break is removed (joining the two
    word halves), remaining line breaks become spaces, and every run of
    whitespace is collapsed to one space. Anything that is not a ``str``
    yields an empty string.
    """
    if isinstance(text, str):
        unwrapped = text.replace("-\n", "").replace("\n", " ")
        return " ".join(unwrapped.split())
    return ""
53
+
54
def safe_join(row):
    """Return the cells of *row* as a list of cleaned strings.

    ``None`` cells become ``""``; every other cell is converted with
    ``str()`` and passed through ``clean_cell_text``.
    """
    cleaned_cells = []
    for cell in row:
        if cell is None:
            cleaned_cells.append("")
        else:
            cleaned_cells.append(clean_cell_text(str(cell)))
    return cleaned_cells
56
+
57
def clamp_bbox(bbox, page_width, page_height):
    """Clip a ``(x0, top, x1, bottom)`` box to the page rectangle.

    Horizontal coordinates are limited to ``[0, page_width]`` and vertical
    coordinates to ``[0, page_height]``. Returns a new 4-tuple in the same
    order.
    """
    def _limit(value, upper):
        # Same max/min nesting for all four coordinates.
        return max(0, min(value, upper))

    left, upper_edge, right, lower_edge = bbox
    return (
        _limit(left, page_width),
        _limit(upper_edge, page_height),
        _limit(right, page_width),
        _limit(lower_edge, page_height),
    )
64
+
65
def _unique_headers(headers):
    """Return *headers* with repeated names made distinct.

    The first occurrence of a name is kept unchanged. A later repeat is
    renamed to ``"<name>_<position>"`` (``"column_<position>"`` when the name
    is empty), where position is the 1-based column index. This keeps
    ``dict(zip(headers, row))`` from overwriting one column with another.
    """
    used = set()
    unique = []
    for position, header in enumerate(headers, start=1):
        candidate = header
        if candidate in used:
            candidate = f"{header or 'column'}_{position}"
            # Extremely unlikely, but never hand out a name twice.
            while candidate in used:
                candidate += "_"
        used.add(candidate)
        unique.append(candidate)
    return unique


def process_page(args):
    """Extract the text and tables of one PDF page.

    Args:
        args: A ``(page_number, pdf_path, text_settings)`` tuple.
            ``page_number`` is the 0-based page index, ``pdf_path`` the file
            to open and ``text_settings`` the keyword arguments passed to
            ``page.extract_words``. A single tuple is used so the function
            can be handed directly to ``map``-style APIs.

    Returns:
        ``(page_number, text)``. ``text`` starts with ``"Page <n>"``
        (1-based), followed by the words lying outside every detected table
        and then one JSON array per non-empty table. If anything fails, the
        text is an ``[ERROR]`` line describing the exception instead.
    """
    suppress_pdfminer_logging()
    try:
        page_number, pdf_path, text_settings = args
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height

            table_bboxes = []
            table_json_outputs = []

            for table in page.find_tables():
                bbox = clamp_bbox(table.bbox, width, height)
                # Recorded even for empty tables so that their area is still
                # excluded from the running text below.
                table_bboxes.append(bbox)

                if not page.crop(bbox).chars:
                    continue

                table_data = table.extract()
                if table_data:
                    # First row is the header; de-duplicate it so columns
                    # with repeated or empty headers are not lost.
                    headers = _unique_headers(safe_join(table_data[0]))
                    json_table = [
                        dict(zip(headers, safe_join(row)))
                        for row in table_data[1:]
                    ]
                    table_json_outputs.append(
                        json.dumps(json_table, indent=1, ensure_ascii=False)
                    )

            # A word belongs to a table when its top-left corner lies inside
            # that table's bounding box.
            words_outside_tables = [
                word for word in page.extract_words(**text_settings)
                if not any(
                    bbox[0] <= float(word['x0']) <= bbox[2] and
                    bbox[1] <= float(word['top']) <= bbox[3]
                    for bbox in table_bboxes
                )
            ]

            # Group words into lines: a new line starts whenever the vertical
            # position moves more than 10 units away from the line's first word.
            text_lines = []
            current_line = []
            current_y = None
            for word in words_outside_tables:
                if current_y is None or abs(word['top'] - current_y) > 10:
                    if current_line:
                        text_lines.append(" ".join(current_line))
                    current_line = [word['text']]
                    current_y = word['top']
                else:
                    current_line.append(word['text'])
            if current_line:
                text_lines.append(" ".join(current_line))

            parts = [
                f"Page {page_number + 1}\n",
                "\n".join(text_lines).strip() + "\n",
            ]
            parts.extend(
                f'"table {idx}":\n{table_json}\n'
                for idx, table_json in enumerate(table_json_outputs, start=1)
            )
            return page_number, "".join(parts)

    except Exception as e:
        # Best effort: one broken page must not abort the whole document.
        return args[0], f"[ERROR] Page {args[0]+1} ({args[1]}): {str(e)}"
129
+
130
def process_pdf(pdf_path):
    """Convert one PDF into a ``.txt`` file stored next to it.

    Pages are processed serially when the document has at most
    ``PARALLEL_THRESHOLD`` pages and in a process pool otherwise. The page
    results are ordered by page number, joined with newlines and written as
    UTF-8 to ``<same directory>/<same base name>.txt``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A status string in every case: an ``[INFO]`` line on success or
        interruption, an ``[ERROR]`` line when the file is missing, cannot be
        opened, or processing fails. Callers are expected to print it.
    """
    suppress_pdfminer_logging()
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        try:
            with pdfplumber.open(pdf_path) as pdf:
                num_pages = len(pdf.pages)
        except PdfminerException as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {str(e)}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {str(e)}"

        pages = [(i, pdf_path, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]

        try:
            if num_pages <= PARALLEL_THRESHOLD:
                results = run_serial(pages)
            else:
                results = run_parallel(pages)
        except (EOFError, BrokenPipeError, KeyboardInterrupt):
            return "[INFO] Processing was interrupted."

        # Pool results may arrive in any order; restore page order.
        sorted_results = sorted(results, key=lambda x: x[0])
        final_output = "\n".join(text for _, text in sorted_results)

        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_dir = os.path.dirname(pdf_path)
        output_path = os.path.join(output_dir, f"{base_name}.txt")

        with open(output_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)

        # Returned (not printed) so the success path yields a status string
        # like every other path instead of an implicit None.
        return f"[INFO] Processing complete: {output_path}"

    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {str(e)}"
168
+
169
def run_serial(pages):
    """Run ``process_page`` on each entry of *pages* in order, in this process.

    Returns the list of results in the same order as *pages*.
    """
    return list(map(process_page, pages))
171
+
172
def run_parallel(pages):
    """Run ``process_page`` over *pages* in a process pool.

    The pool size is the CPU count minus two (at least one), capped at the
    number of pages. Returns the list produced by ``pool.map``.
    """
    spare_cores = max(1, cpu_count() - 2)
    num_cores = min(spare_cores, len(pages))
    print(f"Starting parallel processing with {num_cores} cores...")
    with Pool(processes=num_cores) as pool:
        page_results = pool.map(process_page, pages)
    return page_results
178
+
179
def process_pdfs_main():
    """Command-line entry: process every PDF path given in ``sys.argv[1:]``.

    Each file is opened once to count its pages. Files with at most
    ``PARALLEL_THRESHOLD`` pages are processed concurrently, one file per
    joblib job (phase 1); larger files are processed one after another, each
    using page-level parallelism inside ``process_pdf`` (phase 2). Status
    lines are printed to stdout and flushed immediately so that a parent
    process reading the pipe sees progress as it happens.
    """
    suppress_pdfminer_logging()
    pdf_files = sys.argv[1:]
    if not pdf_files:
        print("No PDF files provided.")
        return

    small_pdfs = []
    large_pdfs = []

    for path in pdf_files:
        if not os.path.exists(path):
            print(f"File not found: {path}", flush=True)
            continue
        try:
            with pdfplumber.open(path) as pdf:
                page_count = len(pdf.pages)
        except PdfminerException:
            # Raised for any pdfminer failure, not only encrypted files.
            print(f"[ERROR] Cannot open PDF (encrypted or damaged), skipped: {path}", flush=True)
        except Exception as e:
            print(f"[ERROR] Error opening {path}: {str(e)}", flush=True)
        else:
            if page_count <= PARALLEL_THRESHOLD:
                small_pdfs.append(path)
            else:
                large_pdfs.append(path)

    if small_pdfs:
        available_cores = max(1, cpu_count() - 2)
        num_cores = min(available_cores, len(small_pdfs))
        print(f"\n[Phase 1] Starting parallel processing of small PDFs with {num_cores} cores...", flush=True)
        results = Parallel(n_jobs=num_cores)(
            delayed(process_pdf)(path) for path in small_pdfs
        )
        for result in results:
            # Only print real status messages, never a literal "None".
            if result is not None:
                print(result, flush=True)

    for path in large_pdfs:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(path)}", flush=True)
        result = process_pdf(path)
        if result is not None:
            print(result, flush=True)
217
+
218
+
219
+ # ========================
220
+ # GUI Class
221
+ # ========================
222
+
223
class FileManager:
    """Tkinter window for choosing PDF files and running the parser on them.

    The window shows the selected files, a preview of the ``.txt`` file that
    belongs to the selected PDF, and the output of the parser. The parser is
    run as a child process (this same program started with the file paths as
    arguments) from a background thread; its output is forwarded to the
    progress box through ``after`` callbacks.
    """

    def __init__(self, master):
        """Build all widgets inside *master* (the Tk root window)."""
        self.master = master
        self.master.title("Parser-Sevenof9")
        self.files = []                  # paths, index-aligned with the listbox rows
        self.last_selected_index = None  # anchor row for shift-click ranges
        self.parser_process = None       # Popen of the running parser, set by the worker thread

        self.label = tk.Label(master, text="Selected PDF files:")
        self.label.pack(pady=5)

        listbox_frame = tk.Frame(master)
        listbox_frame.pack(pady=5)

        scrollbar_listbox = tk.Scrollbar(listbox_frame)
        self.listbox = tk.Listbox(listbox_frame, selectmode=tk.MULTIPLE, width=80, height=6, yscrollcommand=scrollbar_listbox.set)
        scrollbar_listbox.config(command=self.listbox.yview)

        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_listbox.pack(side=tk.RIGHT, fill=tk.Y)

        self.listbox.bind("<<ListboxSelect>>", self.show_text_file)
        self.listbox.bind("<Button-1>", self.on_listbox_click)
        self.listbox.bind("<Shift-Button-1>", self.on_listbox_shift_click)

        self.context_menu = tk.Menu(master, tearoff=0)
        self.context_menu.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self.show_context_menu)

        self.frame = tk.Frame(master)
        self.frame.pack(pady=10)

        tk.Button(self.frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)

        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        text_frame = tk.Frame(master)
        text_frame.pack(padx=10, pady=5)

        scrollbar_text = tk.Scrollbar(text_frame)
        self.text_widget = tk.Text(text_frame, height=15, width=100, wrap=tk.WORD, yscrollcommand=scrollbar_text.set)
        scrollbar_text.config(command=self.text_widget.yview)

        self.text_widget.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_text.pack(side=tk.RIGHT, fill=tk.Y)

        tk.Label(master, text="Progress:").pack()

        progress_frame = tk.Frame(master)
        progress_frame.pack(padx=10, pady=5)

        scrollbar_progress = tk.Scrollbar(progress_frame)
        self.progress_text = tk.Text(progress_frame, height=8, width=100, state=tk.DISABLED, yscrollcommand=scrollbar_progress.set)
        scrollbar_progress.config(command=self.progress_text.yview)

        self.progress_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_progress.pack(side=tk.RIGHT, fill=tk.Y)

    def on_listbox_click(self, event):
        """Plain click: select only the clicked row and preview its text file."""
        index = self.listbox.nearest(event.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(index)
        self.last_selected_index = index
        self.show_text_file(None)
        # "break" stops Tk's default click handling for the listbox.
        return "break"

    def on_listbox_shift_click(self, event):
        """Shift-click: select the range from the last clicked row to this one."""
        index = self.listbox.nearest(event.y)
        if self.last_selected_index is None:
            self.last_selected_index = index
        start, end = sorted((self.last_selected_index, index))
        self.listbox.selection_clear(0, tk.END)
        for i in range(start, end + 1):
            self.listbox.selection_set(i)
        return "break"

    def show_context_menu(self, event):
        """Right-click: open the context menu if at least one row is selected."""
        if self.listbox.curselection():
            self.context_menu.tk_popup(event.x_root, event.y_root)

    def _add_path(self, path):
        """Add *path* to the file list and the listbox unless already present.

        The path is normalised first so the same file reached through
        different separator styles is recognised as a duplicate.
        """
        path = os.path.normpath(path)
        if path not in self.files:
            self.files.append(path)
            self.listbox.insert(tk.END, path)

    def add_folder(self):
        """Ask for a folder and add every ``.pdf`` file found below it."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, files in os.walk(folder):
            for file in files:
                if file.lower().endswith(".pdf"):
                    self._add_path(os.path.join(root, file))

    def add_file(self):
        """Ask for one or more PDF files and add them to the list."""
        paths = filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")])
        for path in paths:
            self._add_path(path)

    def remove_file(self):
        """Remove the selected rows from the listbox and the file list."""
        selection = self.listbox.curselection()
        if not selection:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        # Delete from the bottom up so earlier deletions do not shift the
        # indices that are still to be processed.
        for index in reversed(selection):
            self.listbox.delete(index)
            del self.files[index]
        self.text_widget.delete("1.0", tk.END)

    def remove_all(self):
        """Empty the listbox, the file list and the preview."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text_widget.delete("1.0", tk.END)

    def start_parser(self):
        """Reset the progress box and start the parser in a background thread."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.delete("1.0", tk.END)
        self.progress_text.insert(tk.END, "Starting parser...\n")
        self.progress_text.config(state=tk.DISABLED)
        thread = threading.Thread(target=self.run_parser)
        thread.start()

    def stop_parser(self):
        """Terminate the parser child process if one is currently running."""
        # Take a local reference: the worker thread resets the attribute to
        # None when it finishes, which could happen between check and use.
        process = self.parser_process
        if process is not None and process.poll() is None:
            process.terminate()
            self.append_progress_text("Parser process was stopped.\n")
        else:
            self.append_progress_text("No active parser process to stop.\n")

    def _build_parser_command(self):
        """Return the argv list that re-launches this program on ``self.files``.

        When the program runs as a frozen executable (``sys.frozen`` is set,
        e.g. by PyInstaller) ``sys.executable`` is the program itself, so the
        script path must not be repeated -- otherwise the child would treat
        the executable as its first PDF argument.
        """
        if getattr(sys, "frozen", False):
            return [sys.executable] + list(self.files)
        script_path = os.path.abspath(sys.argv[0])
        return [sys.executable, script_path] + list(self.files)

    def run_parser(self):
        """Run the parser child process and relay its output (worker thread).

        stdout and stderr of the child are merged and forwarded line by line
        to the progress box. A dialog reports success (exit code 0) or
        failure when the child ends.
        """
        try:
            process = subprocess.Popen(
                self._build_parser_command(),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding='utf-8',
                errors='ignore',
                bufsize=4096
            )
            self.parser_process = process
            for line in process.stdout:
                self.append_progress_text(line)
            process.stdout.close()
            process.wait()

            if process.returncode == 0:
                self.append_progress_text("\nParser finished successfully.\n")
                self.show_messagebox_threadsafe("Parser Done", "The parser was executed successfully.")
            else:
                self.append_progress_text("\nError while running the parser.\n")
                self.show_messagebox_threadsafe("Error", "Error while running the parser.", kind="error")
        except Exception as e:
            self.append_progress_text(f"Error: {e}\n")
            self.show_messagebox_threadsafe("Error", f"Error during execution:\n{e}", kind="error")
        finally:
            self.parser_process = None

    def append_progress_text(self, text):
        """Schedule *text* to be appended to the progress box via ``after``."""
        self.progress_text.after(0, lambda: self._insert_text(text))

    def _insert_text(self, text):
        """Append *text* to the (normally read-only) progress box and scroll down."""
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.insert(tk.END, text)
        self.progress_text.see(tk.END)
        self.progress_text.config(state=tk.DISABLED)

    def show_messagebox_threadsafe(self, title, message, kind="info"):
        """Schedule a message box via ``after``.

        Args:
            title: Dialog title.
            message: Dialog text.
            kind: ``"error"`` shows an error dialog; any other value
                (default ``"info"``) shows an information dialog.
        """
        show = messagebox.showerror if kind == "error" else messagebox.showinfo
        self.master.after(0, lambda: show(title, message))

    def show_text_file(self, event):
        """Show the ``.txt`` file belonging to the first selected PDF.

        The text file is looked up next to the PDF, with the same base name.
        *event* is unused; it is accepted so the method can serve as a Tk
        event handler.
        """
        selection = self.listbox.curselection()
        if not selection:
            return
        index = selection[0]
        path = self.files[index]
        txt_path = os.path.splitext(path)[0] + ".txt"
        self.text_widget.delete("1.0", tk.END)
        if os.path.exists(txt_path):
            try:
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    self.text_widget.insert(tk.END, f.read())
            except Exception as e:
                self.text_widget.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text_widget.insert(tk.END, "[No corresponding .txt file found]")
416
+
417
+ # ========================
418
+ # Entry Point
419
+ # ========================
420
+
421
if __name__ == "__main__":
    # freeze_support() has to be the first call here so that multiprocessing
    # works on Windows.
    multiprocessing.freeze_support()

    cli_arguments = sys.argv[1:]
    if not cli_arguments:
        # No arguments: open the graphical file manager.
        root = tk.Tk()
        app = FileManager(root)
        root.mainloop()
    else:
        # Arguments present: treat them as PDF paths and parse without a GUI.
        process_pdfs_main()
430
+