kalle07 committed on
Commit
3829982
·
verified ·
1 Parent(s): 1ec7a25

only commented out much more

Browse files
Files changed (1) hide show
  1. parser_sevenof9_v2_en.py +490 -0
parser_sevenof9_v2_en.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os # OS module for interacting with the operating system (file management, etc.)
2
+ import sys # Provides access to system-specific parameters and functions
3
+ import tkinter as tk # GUI module for creating desktop applications
4
+ from tkinter import filedialog, messagebox # Additional tkinter components for file dialogs and message boxes
5
+ import subprocess # Module to run system commands
6
+ import threading # Threading module to run tasks concurrently
7
+ import tempfile # Module to create temporary files and directories
8
+ import shutil # Module for file operations like copy, move, and delete
9
+ import json # JSON module for working with JSON data
10
+ import logging # Logging module for tracking events and errors
11
+ import pdfplumber # Library for extracting text and tables from PDFs
12
+ from pdfplumber.utils import get_bbox_overlap, obj_to_bbox # Helper functions from pdfplumber for working with bounding boxes
13
+ from pdfplumber.utils.exceptions import PdfminerException # Exception related to PDF processing
14
+ from joblib import delayed, cpu_count, parallel_backend, Parallel # Joblib for parallel processing and optimization
15
+ import multiprocessing # Module for parallel processing using multiple CPU cores
16
+ from multiprocessing import Pool # Pool class for parallelizing tasks across multiple processes
17
+
18
+
19
+ # ========================
20
+ # Parser Configuration
21
+ # ========================
22
+
23
# Keyword arguments handed to pdfplumber's ``extract_words`` for every page.
TEXT_EXTRACTION_SETTINGS = dict(
    x_tolerance=1,           # Horizontal tolerance for text extraction
    y_tolerance=3,           # Vertical tolerance for text extraction
    keep_blank_chars=False,  # Option to retain blank characters in the extracted text
    use_text_flow=True,      # Option to use text flow for better structure
)

# On Windows, send everything written to stderr to the null device so the
# console is not cluttered. NOTE: this also hides tracebacks and warnings.
if sys.platform == "win32":
    sys.stderr = open(os.devnull, 'w')

# PDFs with at most this many pages are processed page by page in a single
# process; longer PDFs have their pages distributed over a process pool.
PARALLEL_THRESHOLD = 16
35
+
36
+ # Function to suppress PDFMiner logging, reducing verbosity
37
def suppress_pdfminer_logging():
    """Set the pdfminer loggers to ERROR so lower-level messages are dropped.

    The root ``pdfminer`` logger and each listed sub-module logger are set
    explicitly. Calling this function repeatedly is harmless.
    """
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        pdfminer_logger = logging.getLogger(name)
        pdfminer_logger.setLevel(logging.ERROR)
49
+
50
+ # Function to clean up text by removing unwanted hyphenations and newlines
51
def clean_cell_text(text):
    """Normalise the text of one table cell to a single, tidy line.

    Hyphenated line breaks (``"-\\n"``) are removed so split words are
    re-joined, remaining newlines become spaces, and any run of whitespace
    collapses to one space.

    Args:
        text: The raw cell value.

    Returns:
        The cleaned string, or ``""`` when *text* is not a ``str``.
    """
    if isinstance(text, str):
        dehyphenated = text.replace("-\n", "")
        single_line = dehyphenated.replace("\n", " ")
        words = single_line.split()
        return " ".join(words)
    return ""
56
+
57
+ # Function to safely clean and join row cell data
58
def safe_join(row):
    """Return the cells of a table row as a list of cleaned strings.

    Args:
        row: Iterable of cell values; a cell may be ``None``.

    Returns:
        A list with one string per cell. ``None`` cells become ``""``; every
        other cell is converted with ``str`` and passed to ``clean_cell_text``.
    """
    cleaned_cells = []
    for cell in row:
        if cell is None:
            cleaned_cells.append("")
        else:
            cleaned_cells.append(clean_cell_text(str(cell)))
    return cleaned_cells
60
+
61
+ # Function to clamp bounding box coordinates within page boundaries
62
def clamp_bbox(bbox, page_width, page_height):
    """Clamp a bounding box so it lies inside the page rectangle.

    Args:
        bbox: ``(x0, top, x1, bottom)`` coordinates.
        page_width: Upper limit for the horizontal coordinates.
        page_height: Upper limit for the vertical coordinates.

    Returns:
        A new ``(x0, top, x1, bottom)`` tuple with horizontal values limited
        to ``[0, page_width]`` and vertical values to ``[0, page_height]``.
    """
    left, upper, right, lower = bbox

    def _limit(value, maximum):
        # Never exceed the maximum, never go below zero.
        return max(0, min(value, maximum))

    return (
        _limit(left, page_width),
        _limit(upper, page_height),
        _limit(right, page_width),
        _limit(lower, page_height),
    )
70
+
71
+ # Function to process a single PDF page
72
def _unique_headers(headers):
    """Return *headers* with repeated names renamed so each can be a dict key.

    The first occurrence of a name (including the empty string) is kept
    unchanged. A repeated blank header becomes ``"column <position>"`` and a
    repeated non-blank header gets a ``" (2)"``, ``" (3)"``, ... suffix.
    """
    seen = set()
    unique = []
    for position, header in enumerate(headers, start=1):
        base = header
        if not base and base in seen:
            base = f"column {position}"
        candidate = base
        suffix = 2
        while candidate in seen:
            candidate = f"{base} ({suffix})"
            suffix += 1
        seen.add(candidate)
        unique.append(candidate)
    return unique


def process_page(args):
    """Extract the text and tables of one PDF page.

    The PDF is opened inside this function, so the call only needs picklable
    arguments and can be executed in a worker process.

    Args:
        args: Tuple ``(page_number, pdf_path, text_settings)`` where
            ``page_number`` is the zero-based page index, ``pdf_path`` the
            file to open and ``text_settings`` the keyword arguments for
            pdfplumber's ``extract_words``.

    Returns:
        Tuple ``(page_number, text)``. ``text`` starts with a
        ``"Page <n>"`` line (one-based), followed by the words found outside
        of tables grouped into lines, followed by one ``"table <i>":`` JSON
        section per extracted table. If anything fails, ``text`` is an
        ``"[ERROR] ..."`` message instead.
    """
    suppress_pdfminer_logging()  # Keep pdfminer quiet in this process too
    try:
        page_number, pdf_path, text_settings = args
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height

            table_bboxes = []        # Areas whose words are left out of the running text
            table_json_outputs = []  # One JSON string per non-empty table

            for table in page.find_tables():
                bbox = clamp_bbox(table.bbox, width, height)
                table_bboxes.append(bbox)

                if not page.crop(bbox).chars:  # Detected table contains no characters
                    continue

                table_data = table.extract()
                if table_data:
                    # The first row supplies the keys. Repeated or blank header
                    # cells are made unique, otherwise dict() would silently
                    # keep only the last column carrying that name.
                    # NOTE: a table consisting of a single row yields "[]".
                    headers = _unique_headers(safe_join(table_data[0]))
                    json_table = [
                        dict(zip(headers, safe_join(row)))
                        for row in table_data[1:]
                    ]
                    table_json_outputs.append(
                        json.dumps(json_table, indent=1, ensure_ascii=False)
                    )

            # A word belongs to a table when its top-left corner lies in a table bbox.
            words_outside_tables = [
                word for word in page.extract_words(**text_settings)
                if not any(
                    bbox[0] <= float(word['x0']) <= bbox[2] and
                    bbox[1] <= float(word['top']) <= bbox[3]
                    for bbox in table_bboxes
                )
            ]

            # Group consecutive words into lines: a new line starts whenever the
            # vertical position differs by more than 10 units from the word that
            # opened the current line.
            lines = []
            current_y = None
            for word in words_outside_tables:
                if current_y is None or abs(word['top'] - current_y) > 10:
                    lines.append([word['text']])
                    current_y = word['top']
                else:
                    lines[-1].append(word['text'])
            text_content = "\n".join(" ".join(line) for line in lines)

            parts = [f"Page {page_number + 1}\n", text_content.strip() + "\n"]
            parts.extend(
                f'"table {idx}":\n{table_json}\n'
                for idx, table_json in enumerate(table_json_outputs, start=1)
            )
            return page_number, "".join(parts)

    except Exception as e:
        # Report the failure as this page's text so the other pages still get written.
        return args[0], f"[ERROR] Page {args[0]+1} ({args[1]}): {str(e)}"
142
+
143
+ # Function to process the entire PDF document
144
def process_pdf(pdf_path):
    """Parse one PDF and write the result to a ``.txt`` file next to it.

    Pages are processed serially when the document has at most
    ``PARALLEL_THRESHOLD`` pages and in a process pool otherwise. The page
    results are ordered by page number, joined with newlines and written to
    ``<pdf name without extension>.txt`` in the PDF's directory.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        ``None`` on success (progress is printed), otherwise a string starting
        with ``"[ERROR]"`` or ``"[INFO]"`` that describes why processing
        stopped.
    """
    suppress_pdfminer_logging()
    interruptions = (EOFError, BrokenPipeError, KeyboardInterrupt)
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")

        # Open once just to learn the page count; each page job reopens the file.
        try:
            with pdfplumber.open(pdf_path) as document:
                page_total = len(document.pages)
        except PdfminerException as e:
            return f"[ERROR] Cannot open PDF: {pdf_path} – {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path} – {e}"

        jobs = [
            (index, pdf_path, TEXT_EXTRACTION_SETTINGS)
            for index in range(page_total)
        ]

        try:
            if page_total > PARALLEL_THRESHOLD:
                page_results = run_parallel(jobs)
            else:
                page_results = run_serial(jobs)
        except interruptions:
            return "[INFO] Processing was interrupted."

        # Restore page order before concatenating the per-page texts.
        ordered = sorted(page_results, key=lambda item: item[0])
        page_texts = [text for _, text in ordered]
        final_output = "\n".join(page_texts)

        folder, file_name = os.path.split(pdf_path)
        stem, _ = os.path.splitext(file_name)
        output_path = os.path.join(folder, stem + ".txt")

        with open(output_path, "w", encoding="utf-8", errors="ignore") as handle:
            handle.write(final_output)

        print(f"[INFO] Processing complete: {output_path}")

    except interruptions:
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {e}"
182
+
183
+ # Function to run the PDF processing serially (one page at a time)
184
def run_serial(pages):
    """Process the page jobs one after another in the current process.

    Args:
        pages: Iterable of argument tuples accepted by ``process_page``.

    Returns:
        List with the ``process_page`` result of every job, in input order.
    """
    results = []
    for page_args in pages:
        results.append(process_page(page_args))
    return results
186
+
187
+ # Function to run the PDF processing in parallel (across multiple cores)
188
def run_parallel(pages):
    """Process the page jobs in a pool of worker processes.

    Args:
        pages: Iterable of argument tuples accepted by ``process_page``.

    Returns:
        List with the ``process_page`` result of every job, in input order.
        An empty input returns ``[]`` without creating a pool.
    """
    pages = list(pages)  # Accept any iterable; len() is needed below
    if not pages:
        # Pool(processes=0) raises ValueError, so there is nothing to start.
        return []

    available_cores = max(1, cpu_count() - 2)  # Leave two cores for the rest of the system
    num_cores = min(available_cores, len(pages))  # Never more workers than jobs
    print(f"Starting parallel processing with {num_cores} cores...")
    with Pool(processes=num_cores) as pool:
        return pool.map(process_page, pages)
194
+
195
+ # Main function to process a list of PDFs
196
def process_pdfs_main():
    """Command-line entry point: parse every PDF path given in ``sys.argv[1:]``.

    The files are split by page count. PDFs with at most
    ``PARALLEL_THRESHOLD`` pages are handled in phase 1, several files at a
    time via joblib. Larger PDFs are handled in phase 2, one file at a time
    (``process_pdf`` parallelises their pages itself). Missing or unreadable
    files are reported and skipped. Progress and errors are printed to stdout.
    """
    suppress_pdfminer_logging()
    pdf_files = sys.argv[1:]
    if not pdf_files:
        print("No PDF files provided.")
        return

    small_pdfs = []  # At most PARALLEL_THRESHOLD pages
    large_pdfs = []  # More than PARALLEL_THRESHOLD pages

    for path in pdf_files:
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        try:
            with pdfplumber.open(path) as pdf:
                page_count = len(pdf.pages)
        except PdfminerException as e:
            # Not only encrypted files end up here, so report the actual
            # reason instead of claiming the PDF is password-protected.
            print(f"[ERROR] Cannot open PDF (corrupt or password-protected), skipped: {path} – {e}")
            continue
        except Exception as e:
            print(f"[ERROR] Error opening {path}: {str(e)}")
            continue

        if page_count <= PARALLEL_THRESHOLD:
            small_pdfs.append(path)
        else:
            large_pdfs.append(path)

    # Phase 1: small PDFs, several files at once.
    if small_pdfs:
        available_cores = max(1, cpu_count() - 2)  # Leave two cores for the rest of the system
        num_cores = min(available_cores, len(small_pdfs))
        print(f"\n[Phase 1] Starting parallel processing of small PDFs with {num_cores} cores...")
        results = Parallel(n_jobs=num_cores)(
            delayed(process_pdf)(path) for path in small_pdfs
        )
        for message in results:
            # process_pdf returns None on success; only print real messages.
            if message:
                print(message)

    # Phase 2: large PDFs, one file after another.
    for path in large_pdfs:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(path)}")
        message = process_pdf(path)
        if message:
            print(message)
237
+
238
+
239
+ # GUI
240
+
241
class FileManager:
    """Tkinter window for collecting PDF files, running the parser and viewing results.

    The window shows the selected PDF paths, buttons to add and remove
    entries, a viewer for the ``.txt`` file that belongs to the selected PDF,
    and a progress area that displays the output of the parser. The parser is
    this same script, started as a child process with the file paths as
    arguments.
    """

    def __init__(self, master):
        """Build all widgets inside *master* (the Tk root window)."""
        self.master = master
        self.master.title("Parser-Sevenof9")

        self.files = []                  # PDF paths, index-aligned with the listbox rows
        self.last_selected_index = None  # Anchor row for shift-click range selection
        self.parser_process = None       # Running parser child process, if any
        self._parser_thread = None       # Thread that reads the child's output
        self._stop_requested = False     # True once the user pressed "Stop" for the current run

        # The order of these calls determines the top-to-bottom window layout.
        self._build_file_list(master)
        self._build_buttons(master)
        self._build_text_viewer(master)
        self._build_progress_view(master)

    def _build_file_list(self, master):
        """Create the label, listbox, scrollbar and context menu for the file list."""
        self.label = tk.Label(master, text="Selected PDF files:")
        self.label.pack(pady=5)

        listbox_frame = tk.Frame(master)
        listbox_frame.pack(pady=5)

        scrollbar_listbox = tk.Scrollbar(listbox_frame)
        self.listbox = tk.Listbox(
            listbox_frame, selectmode=tk.MULTIPLE, width=80, height=6,
            yscrollcommand=scrollbar_listbox.set
        )
        scrollbar_listbox.config(command=self.listbox.yview)

        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_listbox.pack(side=tk.RIGHT, fill=tk.Y)

        self.listbox.bind("<<ListboxSelect>>", self.show_text_file)
        self.listbox.bind("<Button-1>", self.on_listbox_click)
        self.listbox.bind("<Shift-Button-1>", self.on_listbox_shift_click)

        self.context_menu = tk.Menu(master, tearoff=0)
        self.context_menu.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self.show_context_menu)

    def _build_buttons(self, master):
        """Create the add/remove button row and the Stop / Start Parser buttons."""
        self.frame = tk.Frame(master)
        self.frame.pack(pady=10)

        tk.Button(self.frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

    def _build_text_viewer(self, master):
        """Create the text widget that shows the parsed ``.txt`` of the selected PDF."""
        text_frame = tk.Frame(master)
        text_frame.pack(padx=10, pady=5)

        scrollbar_text = tk.Scrollbar(text_frame)
        self.text_widget = tk.Text(
            text_frame, height=15, width=100, wrap=tk.WORD,
            yscrollcommand=scrollbar_text.set
        )
        scrollbar_text.config(command=self.text_widget.yview)

        self.text_widget.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_text.pack(side=tk.RIGHT, fill=tk.Y)

    def _build_progress_view(self, master):
        """Create the read-only text widget that shows the parser output."""
        tk.Label(master, text="Progress:").pack()

        progress_frame = tk.Frame(master)
        progress_frame.pack(padx=10, pady=5)

        scrollbar_progress = tk.Scrollbar(progress_frame)
        self.progress_text = tk.Text(
            progress_frame, height=8, width=100, state=tk.DISABLED,
            yscrollcommand=scrollbar_progress.set
        )
        scrollbar_progress.config(command=self.progress_text.yview)

        self.progress_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_progress.pack(side=tk.RIGHT, fill=tk.Y)

    def on_listbox_click(self, event):
        """Select only the clicked row and show its ``.txt`` file."""
        index = self.listbox.nearest(event.y)
        if index < 0:
            # nearest() reports -1 for an empty listbox; nothing to select.
            return "break"
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(index)
        self.last_selected_index = index
        self.show_text_file(None)
        return "break"  # Skip the default MULTIPLE-mode toggle behaviour

    def on_listbox_shift_click(self, event):
        """Select the range between the last clicked row and the shift-clicked row."""
        index = self.listbox.nearest(event.y)
        if index < 0:
            return "break"
        if self.last_selected_index is None:
            self.last_selected_index = index
        start, end = sorted((self.last_selected_index, index))
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(start, end)
        return "break"

    def show_context_menu(self, event):
        """Open the right-click menu, but only when at least one row is selected."""
        if self.listbox.curselection():
            self.context_menu.tk_popup(event.x_root, event.y_root)

    def _add_path(self, path):
        """Append *path* to the file list unless it is already present."""
        if path not in self.files:
            self.files.append(path)
            self.listbox.insert(tk.END, path)

    def add_folder(self):
        """Ask for a folder and add every ``.pdf`` found in it and its subfolders."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, files in os.walk(folder):
            for file in files:
                if file.lower().endswith(".pdf"):
                    self._add_path(os.path.join(root, file))

    def add_file(self):
        """Ask for one or more PDF files and add them to the list."""
        paths = filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")])
        for path in paths:
            self._add_path(path)

    def remove_file(self):
        """Remove the selected rows from the listbox and from ``self.files``."""
        selection = self.listbox.curselection()
        if not selection:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        for index in reversed(selection):  # Back to front so earlier indices stay valid
            self.listbox.delete(index)
            del self.files[index]
        self.last_selected_index = None  # The old anchor row may no longer exist
        self.text_widget.delete("1.0", tk.END)

    def remove_all(self):
        """Clear the file list and the text viewer."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.last_selected_index = None
        self.text_widget.delete("1.0", tk.END)

    def start_parser(self):
        """Start parsing the listed files in a background thread.

        Does nothing except show a notice when no files are listed or when a
        previous parser run is still active.
        """
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        if self._parser_thread is not None and self._parser_thread.is_alive():
            # A second run would overwrite self.parser_process, so "Stop" could
            # no longer reach the first child process.
            messagebox.showinfo("Parser Running", "The parser is already running.")
            return

        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.delete("1.0", tk.END)
        self.progress_text.insert(tk.END, "Starting parser...\n")
        self.progress_text.config(state=tk.DISABLED)

        self._stop_requested = False
        # Hand the worker a snapshot so later edits of the list cannot affect this run.
        self._parser_thread = threading.Thread(
            target=self.run_parser, args=(list(self.files),)
        )
        self._parser_thread.start()

    def stop_parser(self):
        """Terminate the parser child process if one is running."""
        # Work on a local reference: the worker thread sets the attribute to
        # None when the process ends, possibly between the two checks below.
        process = self.parser_process
        if process is not None and process.poll() is None:
            self._stop_requested = True
            process.terminate()
            self.append_progress_text("Parser process was stopped.\n")
        else:
            self.append_progress_text("No active parser process to stop.\n")

    def run_parser(self, files=None):
        """Run this script as a child process and forward its output to the progress view.

        Intended to run in a background thread (see ``start_parser``).

        Args:
            files: PDF paths to pass to the child process. Defaults to a copy
                of ``self.files``.
        """
        files = list(self.files) if files is None else list(files)
        # Unbuffered UTF-8 output: progress lines arrive while the child is
        # still working and match the decoding used for the pipe below.
        child_env = dict(os.environ, PYTHONUNBUFFERED="1", PYTHONIOENCODING="utf-8")
        try:
            process = subprocess.Popen(
                [sys.executable, __file__] + files,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding='utf-8',
                errors='ignore',
                bufsize=4096,
                env=child_env,
            )
            self.parser_process = process
            with process.stdout:
                for line in process.stdout:
                    self.append_progress_text(line)
            process.wait()

            if self._stop_requested:
                # stop_parser() already reported the stop; a terminated child
                # is not an error.
                pass
            elif process.returncode == 0:
                self.append_progress_text("\nParser finished successfully.\n")
                self.show_messagebox_threadsafe("Parser Done", "The parser was executed successfully.")
            else:
                self.append_progress_text("\nError while running the parser.\n")
                self.show_messagebox_threadsafe("Error", "Error while running the parser.", error=True)
        except Exception as e:
            self.append_progress_text(f"Error: {e}\n")
            self.show_messagebox_threadsafe("Error", f"Error during execution:\n{e}", error=True)
        finally:
            self.parser_process = None

    def append_progress_text(self, text):
        """Schedule *text* to be appended to the progress view by the Tk event loop."""
        self.progress_text.after(0, lambda: self._insert_text(text))

    def _insert_text(self, text):
        """Append *text* to the read-only progress view and scroll to the end."""
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.insert(tk.END, text)
        self.progress_text.see(tk.END)
        self.progress_text.config(state=tk.DISABLED)

    def show_messagebox_threadsafe(self, title, message, error=False):
        """Schedule a message box to be shown by the Tk event loop.

        Args:
            title: Window title of the message box.
            message: Text to display.
            error: Show an error dialog instead of an info dialog.
        """
        show = messagebox.showerror if error else messagebox.showinfo
        self.master.after(0, lambda: show(title, message))

    def show_text_file(self, event=None):
        """Display the ``.txt`` file that belongs to the first selected PDF.

        Args:
            event: Tk event when used as a binding callback; not used.
        """
        selection = self.listbox.curselection()
        if not selection:
            return
        index = selection[0]
        if index >= len(self.files):
            return
        txt_path = os.path.splitext(self.files[index])[0] + ".txt"
        self.text_widget.delete("1.0", tk.END)
        if os.path.exists(txt_path):
            try:
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    self.text_widget.insert(tk.END, f.read())
            except Exception as e:
                self.text_widget.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text_widget.insert(tk.END, "[No corresponding .txt file found]")
476
+
477
+
478
+ # MAIN
479
+
480
if __name__ == "__main__":
    # Required for Windows compatibility with multiprocessing.
    multiprocessing.freeze_support()

    started_with_files = len(sys.argv) > 1
    if not started_with_files:
        # No arguments: open the GUI.
        root = tk.Tk()
        app = FileManager(root)
        root.mainloop()
    else:
        # File arguments present (e.g. when launched by the GUI as a
        # subprocess): run the parser on them.
        process_pdfs_main()