Sudhanshu Pandey committed on
Commit a7b8c18 · 1 Parent(s): b1cb0f5

Update the files

LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Sudhanshu Pandey
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
images/image3.png ADDED
images/image4.png ADDED
requirements.txt ADDED
@@ -0,0 +1,114 @@
+ albucore==0.0.13
+ albumentations==1.4.10
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.8.0
+ asgiref==3.8.1
+ astor==0.8.1
+ attrs==24.3.0
+ beautifulsoup4==4.12.3
+ blinker==1.9.0
+ cachetools==5.5.0
+ certifi==2024.12.14
+ charset-normalizer==3.4.1
+ click==8.1.8
+ contourpy==1.3.1
+ crispy-bootstrap4==2024.10
+ cycler==0.12.1
+ Cython==3.0.11
+ decorator==5.1.1
+ filelock==3.16.1
+ filetype==1.2.0
+ fire==0.7.0
+ fonttools==4.55.3
+ fsspec==2024.12.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ h11==0.14.0
+ httpcore==1.0.7
+ httpx==0.28.1
+ hub-sdk==0.0.17
+ huggingface-hub==0.27.1
+ idna==3.7
+ imageio==2.36.1
+ imgaug==0.4.0
+ Jinja2==3.1.5
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ kiwisolver==1.4.8
+ lazy_loader==0.4
+ lmdb==1.6.2
+ lxml==5.3.0
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.10.0
+ mdurl==0.1.2
+ mpmath==1.3.0
+ narwhals==1.22.0
+ networkx==3.4.2
+ numpy==1.26.4
+ opencv-contrib-python==4.10.0.84
+ opencv-python==4.10.0.84
+ opencv-python-headless==4.10.0.84
+ opt-einsum==3.3.0
+ packaging==24.2
+ paddleocr==2.9.1
+ paddlepaddle==2.6.2
+ pandas==2.2.3
+ pillow==11.1.0
+ protobuf==3.20.3
+ psutil==6.1.1
+ py-cpuinfo==9.0.0
+ pyarrow==18.1.0
+ pybboxes==0.1.6
+ pyclipper==1.3.0.post6
+ pydantic==2.10.5
+ pydantic_core==2.27.2
+ pydeck==0.9.1
+ Pygments==2.19.1
+ pyparsing==3.2.1
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ pytz==2024.2
+ PyYAML==6.0.2
+ RapidFuzz==3.11.0
+ referencing==0.35.1
+ requests==2.32.3
+ requests-toolbelt==1.0.0
+ rich==13.9.4
+ roboflow==1.1.50
+ rpds-py==0.22.3
+ sahi==0.11.20
+ scikit-image==0.25.0
+ scikit-learn==1.6.1
+ scipy==1.15.1
+ seaborn==0.13.2
+ setuptools==75.8.0
+ shapely==2.0.6
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soupsieve==2.6
+ sqlparse==0.5.3
+ streamlit==1.41.1
+ sympy==1.13.1
+ tenacity==9.0.0
+ termcolor==2.5.0
+ terminaltables==3.1.10
+ thop==0.1.1.post2209072238
+ threadpoolctl==3.5.0
+ tifffile==2025.1.10
+ toml==0.10.2
+ tomli==2.2.1
+ torch==2.5.1
+ torchvision==0.20.1
+ tornado==6.4.2
+ tqdm==4.67.1
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ ultralytics==8.3.65
+ ultralytics-thop==2.0.14
+ ultralyticsplus==0.1.0
+ urllib3==2.3.0
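
The pins above are a complete freeze of the development environment. As an optional smoke test (not part of the commit), the heavyweight dependencies can be imported after running pip install -r requirements.txt:

# Hypothetical smoke test; run after `pip install -r requirements.txt`
import paddleocr  # imported only to confirm it loads
import streamlit
import ultralytics
print('streamlit', streamlit.__version__, '| ultralytics', ultralytics.__version__)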
src/__init__.py ADDED
File without changes
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (170 Bytes)
 
src/models/__pycache__/table_detector.cpython-312.pyc ADDED
Binary file (5.76 kB)
 
src/models/__pycache__/text_recognizer.cpython-312.pyc ADDED
Binary file (6.64 kB)
 
src/models/paddleocr_models/det/inference.pdiparams.info ADDED
Binary file (26.4 kB)
 
src/models/paddleocr_models/rec/inference.pdiparams.info ADDED
Binary file (103 kB)
 
src/models/table_detector.py ADDED
@@ -0,0 +1,123 @@
+ from pathlib import Path
+ from typing import Optional, Union
+ import numpy as np
+ from ultralytics import YOLO
+ # from ultralyticsplus import YOLO
+
+
+ class TableDetector:
+     """
+     A class for detecting tables in document images using YOLO models.
+
+     Attributes:
+         model_path (str): Path to the YOLO model weights
+         confidence (float): Confidence threshold for detection
+         iou_threshold (float): IoU threshold for NMS
+     """
+
+     def __init__(
+         self,
+         confidence: float = 0.50,
+         iou_threshold: float = 0.45
+     ) -> None:
+         """
+         Initialize the TableDetector with model and parameters.
+
+         Args:
+             confidence: Confidence threshold for detection
+             iou_threshold: IoU threshold for NMS
+         """
+         self.model_path = 'src/models/table-detection-and-extraction.pt'
+         self.model = YOLO(str(self.model_path))
+         self.min_conf = confidence
+         self.iou = iou_threshold
+
+     def detect(self, image_path: Union[str, Path]) -> Optional[list]:
+         """
+         Detect tables in the given image.
+
+         Args:
+             image_path: Path to the input image
+
+         Returns:
+             A one-element list holding the largest detected table's bounding
+             box, an empty list if nothing was detected, or None on failure
+         """
+         results = self.model.predict(str(image_path), verbose=False, iou=self.iou, conf=self.min_conf)
+         if results:
+             # Move tensors to CPU before converting to numpy
+             boxes = results[0].boxes.xyxy.cpu().numpy()
+             coords = self.merge_boxes(boxes)
+             # Keep only the largest merged box (by area)
+             return [sorted(coords, key=lambda x: (x[2] - x[0]) * (x[3] - x[1]), reverse=True)[0]] if len(coords) > 0 else []
+         return None
+
+     def merge_boxes(self, boxes: np.ndarray, overlap_threshold: float = 35) -> np.ndarray:
+         """
+         Merge overlapping bounding boxes.
+
+         Args:
+             boxes: Array of bounding box coordinates
+             overlap_threshold: Percentage overlap above which boxes are merged
+
+         Returns:
+             Array of merged bounding box coordinates
+         """
+         # Sort boxes by area in descending order
+         areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+         sorted_indices = np.argsort(-areas)
+         boxes = boxes[sorted_indices]
+
+         merged_boxes = []
+
+         for box in boxes:
+             if not merged_boxes:
+                 merged_boxes.append(box)
+                 continue
+
+             overlap_found = False
+             for i, merged_box in enumerate(merged_boxes):
+                 iou = self._calculate_overlap(box, merged_box)
+                 if iou > overlap_threshold:
+                     # Keep the larger box
+                     box_area = (box[2] - box[0]) * (box[3] - box[1])
+                     merged_area = (merged_box[2] - merged_box[0]) * (merged_box[3] - merged_box[1])
+                     if box_area > merged_area:
+                         merged_boxes[i] = box
+                     overlap_found = True
+                     break
+
+             if not overlap_found:
+                 merged_boxes.append(box)
+
+         return np.array(merged_boxes).astype(int)
+
+     @staticmethod
+     def _calculate_overlap(box1: np.ndarray, box2: np.ndarray) -> float:
+         """
+         Calculate the percentage overlap between two boxes.
+
+         Args:
+             box1: First bounding box coordinates
+             box2: Second bounding box coordinates
+
+         Returns:
+             Intersection area as a percentage of the smaller box's area
+         """
+         x_left = max(box1[0], box2[0])
+         y_top = max(box1[1], box2[1])
+         x_right = min(box1[2], box2[2])
+         y_bottom = min(box1[3], box2[3])
+
+         if x_right < x_left or y_bottom < y_top:
+             return 0.0
+
+         intersection_area = (x_right - x_left) * (y_bottom - y_top)
+         box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+         box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+         min_area = min(box1_area, box2_area)
+         if min_area == 0:
+             return 0.0
+
+         return (intersection_area / min_area) * 100
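
For orientation, here is a minimal usage sketch for TableDetector. The image filename is invented, and the hard-coded weights path in __init__ implies the process runs from the repository root:

# Sketch: assumes src/ is on sys.path (as the other modules' imports imply)
# and that 'sample_page.png' is a document image you supply.
from models.table_detector import TableDetector

detector = TableDetector(confidence=0.5, iou_threshold=0.45)
boxes = detector.detect('sample_page.png')
if boxes:
    x1, y1, x2, y2 = boxes[0]  # largest detected table, pixel coordinates
    print(f'Table at ({x1}, {y1})-({x2}, {y2})')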
src/models/text_recognizer.py ADDED
@@ -0,0 +1,115 @@
+ from pathlib import Path
+ from typing import List, Optional, Dict, Union
+ import numpy as np
+ import pandas as pd
+ from paddleocr import PaddleOCR
+ from PIL import Image
+
+ class TextRecognizer:
+     """
+     A class for performing OCR on detected tables using PaddleOCR.
+
+     Attributes:
+         models_dir (Path): Directory containing OCR model files
+     """
+
+     def __init__(self, models_dir: Optional[Union[str, Path]] = None) -> None:
+         """
+         Initialize the TextRecognizer with model directory.
+
+         Args:
+             models_dir: Directory containing OCR model files
+         """
+         self.models_dir = Path(models_dir) if models_dir else Path(__file__).parent / 'paddleocr_models'
+         self._setup_model_dirs()
+
+         self.model = PaddleOCR(
+             use_angle_cls=False,
+             lang='en',
+             det_model_dir=str(self.models_dir / 'det'),
+             rec_model_dir=str(self.models_dir / 'rec')
+         )
+
+     def _setup_model_dirs(self) -> None:
+         """Create necessary directories for model files."""
+         (self.models_dir / 'det').mkdir(parents=True, exist_ok=True)
+         (self.models_dir / 'rec').mkdir(parents=True, exist_ok=True)
+
+     def recognize(
+         self,
+         image_path: Union[str, Path],
+         table_boxes: Optional[np.ndarray] = None,
+         padding: tuple = (0, 0)
+     ) -> List[pd.DataFrame]:
+         """
+         Perform OCR on the image within specified table regions.
+
+         Args:
+             image_path: Path to the input image
+             table_boxes: Array of table bounding box coordinates
+             padding: Padding to add around table regions (x, y)
+
+         Returns:
+             List of DataFrames containing extracted text and positions
+         """
+         with Image.open(image_path) as img:
+             img_array = np.array(img.convert('RGB'))
+
+         # Crop to the single detected table (with optional padding) before OCR
+         if table_boxes is not None and len(table_boxes) == 1:
+             pad_x, pad_y = padding
+             box = table_boxes[0]
+             img_array = img_array[
+                 max(box[1] - pad_y, 0):box[3] + pad_y,
+                 max(box[0] - pad_x, 0):box[2] + pad_x
+             ]
+
+         ocr_result = self.model.ocr(img_array)
+
+         # PaddleOCR returns [None] when no text is found
+         if not ocr_result or ocr_result[0] is None:
+             return [pd.DataFrame(columns=['text', 'boundingBox'])]
+
+         if table_boxes is not None and len(table_boxes) > 1:
+             return self._process_multiple_tables(ocr_result[0], table_boxes)
+         return self._process_single_table(ocr_result[0])
+
+     def _process_multiple_tables(
+         self,
+         ocr_data: List,
+         table_boxes: np.ndarray
+     ) -> List[pd.DataFrame]:
+         """Process OCR results for multiple tables."""
+         result: Dict[int, List] = {}
+
+         for item in ocr_data:
+             bbox = np.array(item[0]).astype(int)
+             word = item[1][0]
+             bbox = [bbox[:, 0].min(), bbox[:, 1].min(), bbox[:, 0].max(), bbox[:, 1].max()]
+
+             for idx, table_box in enumerate(table_boxes):
+                 if (bbox[0] >= table_box[0] and bbox[1] >= table_box[1] and
+                         bbox[0] <= table_box[2] and bbox[1] <= table_box[3]):
+                     if idx not in result:
+                         result[idx] = []
+                     result[idx].append((word, bbox))
+
+         return [
+             pd.DataFrame(
+                 sorted(table_data, key=lambda x: (x[1][1], x[1][0])),
+                 columns=['text', 'boundingBox']
+             )
+             for table_data in result.values()
+         ]
+
+     def _process_single_table(self, ocr_data: List) -> List[pd.DataFrame]:
+         """Process OCR results for a single table."""
+         processed_data = [
+             (item[1][0], [
+                 np.array(item[0])[:, 0].min(),
+                 np.array(item[0])[:, 1].min(),
+                 np.array(item[0])[:, 0].max(),
+                 np.array(item[0])[:, 1].max()
+             ])
+             for item in ocr_data
+         ]
+
+         return [pd.DataFrame(
+             sorted(processed_data, key=lambda x: (x[1][1], x[1][0])),
+             columns=['text', 'boundingBox']
+         )]
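
A sketch of the detector-to-recognizer handoff (file name hypothetical; the repository appears to bundle the PaddleOCR det/rec weights under src/models/paddleocr_models):

# Sketch: run with src/ on sys.path; 'sample_page.png' is a placeholder
from models.table_detector import TableDetector
from models.text_recognizer import TextRecognizer

detector = TableDetector()
recognizer = TextRecognizer()  # defaults to the bundled paddleocr_models dir

boxes = detector.detect('sample_page.png')
tables = recognizer.recognize('sample_page.png', table_boxes=boxes, padding=(5, 5))
print(tables[0].head())  # DataFrame with 'text' and 'boundingBox' columns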
src/streamlit_app.py ADDED
@@ -0,0 +1,475 @@
+ from table_creator.table_extractor import TableExtraction
+ import streamlit as st
+ import base64
+ from PIL import Image
+ import os
+ import cv2
+ import numpy as np
+ import tempfile
+ import traceback
+
+ # Load the models only once per session
+ if 'tab_ext' not in st.session_state:
+     st.session_state.tab_ext = TableExtraction()
+     print('Models loaded.')
+
+
+ def process_image(imgpath):
+     return st.session_state.tab_ext.detect(imgpath)
+
+ def draw_bounding_box(image, bbox):
+     """Draw a bounding box on the image."""
+     img_array = np.array(image)
+     if len(img_array.shape) == 3:
+         img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
+
+     # Cast to plain ints so cv2.rectangle accepts numpy-typed coordinates
+     x_min, y_min, x_max, y_max = map(int, bbox)
+     cv2.rectangle(img_array, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
+
+     if len(img_array.shape) == 3:
+         img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
+
+     return Image.fromarray(img_array)
+
+
+ # Set page config
+ st.set_page_config(
+     page_title="Table Extraction Tool",
+     layout="wide",
+     initial_sidebar_state="expanded"  # Expanded so the guide shows by default
+ )
+
+
+ # Enhanced CSS styling with updated upload section
+ st.markdown("""
+     <style>
+     /* Main container and background */
+     .main { padding: 1.5rem; }
+     .stApp {
+         background: linear-gradient(135deg, #f6f9fc 0%, #f0f4f8 100%);
+     }
+
+     /* Header styling */
+     .main-header {
+         background: linear-gradient(90deg, #1a365d 0%, #2563eb 100%);
+         color: white;
+         padding: 2rem 3rem;
+         border-radius: 15px;
+         margin-bottom: 2rem;
+         box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+     }
+
+     .main-header h1 {
+         font-size: 2.5rem;
+         margin-bottom: 0.5rem;
+         font-weight: 600;
+         color: white;
+     }
+
+     .main-header p {
+         font-size: 1.1rem;
+         opacity: 0.9;
+     }
+
+     /* Card containers */
+     .content-card {
+         background-color: white;
+         padding: 1.5rem;
+         border-radius: 12px;
+         box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+         border: 1px solid #e5e7eb;
+         margin-bottom: 1.5rem;
+     }
+
+     /* Upload section - reduced size */
+     .upload-section {
+         text-align: center;
+         padding: 1rem;
+         border: 2px dashed #e5e7eb;
+         border-radius: 12px;
+         background-color: #f8fafc;
+         max-width: 600px;
+         margin: 0 auto;
+     }
+
+     .upload-icon {
+         font-size: 1.5rem;
+         color: #2563eb;
+         margin-bottom: 0.5rem;
+     }
+
+     /* Results section */
+     .results-header {
+         font-size: 1.25rem;
+         color: #1f2937;
+         margin-bottom: 1rem;
+         padding-bottom: 0.5rem;
+         border-bottom: 2px solid #e5e7eb;
+     }
+
+     /* Download buttons */
+     .download-button {
+         display: inline-block;
+         padding: 0.75rem 1.5rem;
+         background-color: #2563eb;
+         color: white;
+         text-decoration: none;
+         border-radius: 8px;
+         transition: all 0.2s;
+         text-align: center;
+         width: 100%;
+     }
+
+     .download-button:hover {
+         background-color: #1d4ed8;
+         box-shadow: 0 4px 6px -1px rgba(37, 99, 235, 0.2);
+     }
+
+     /* Tabs styling */
+     .stTabs [data-baseweb="tab-list"] {
+         gap: 1rem;
+         background-color: #f8fafc;
+         padding: 0.5rem;
+         border-radius: 8px;
+     }
+
+     .stTabs [data-baseweb="tab"] {
+         color: #4b5563;
+         font-weight: 500;
+         padding: 0.5rem 1.5rem;
+         border-radius: 6px;
+     }
+
+     .stTabs [data-baseweb="tab"][aria-selected="true"] {
+         background-color: #2563eb;
+         color: white;
+     }
+
+     /* Guide section styling */
+     .guide-section {
+         background-color: white;
+         padding: 2rem;
+         border-radius: 12px;
+         margin-bottom: 1.5rem;
+     }
+
+     .guide-header {
+         color: #1a365d;
+         font-size: 1.5rem;
+         margin-bottom: 1rem;
+         border-bottom: 2px solid #e5e7eb;
+         padding-bottom: 0.5rem;
+     }
+
+     .guide-subheader {
+         color: #2563eb;
+         font-size: 1.2rem;
+         margin: 1.5rem 0 0.5rem 0;
+     }
+
+     .guide-text {
+         color: #4b5563;
+         line-height: 1.6;
+         margin-bottom: 1rem;
+     }
+
+     .feature-card {
+         background-color: #f8fafc;
+         padding: 1rem;
+         border-radius: 8px;
+         margin-bottom: 1rem;
+         border-left: 4px solid #2563eb;
+     }
+
+     .step-container {
+         display: flex;
+         align-items: flex-start;
+         margin-bottom: 1rem;
+     }
+
+     .step-number {
+         background-color: #2563eb;
+         color: white;
+         width: 24px;
+         height: 24px;
+         border-radius: 12px;
+         display: flex;
+         align-items: center;
+         justify-content: center;
+         margin-right: 1rem;
+         flex-shrink: 0;
+     }
+
+     .info-icon {
+         color: #2563eb;
+         margin-right: 0.5rem;
+     }
+
+     .tech-details {
+         background-color: #f0f9ff;
+         padding: 1rem;
+         border-radius: 8px;
+         margin: 1rem 0;
+     }
+     </style>
+ """, unsafe_allow_html=True)
+
+
+ # Create sidebar with guide content
+ with st.sidebar:
+     # st.markdown('<div class="guide-section">', unsafe_allow_html=True)
+     st.divider()
+     st.markdown('<h2 class="guide-header">📚 User Guide</h2>', unsafe_allow_html=True)
+
+     # How It Works section
+     st.markdown('<h3 class="guide-subheader">🎯 How It Works</h3>', unsafe_allow_html=True)
+     st.markdown("""
+         <div class="guide-text">
+         This tool uses computer vision and machine learning techniques to:
+         <ul>
+             <li>Detect and locate tables in document images</li>
+             <li>Extract structured data from the detected tables</li>
+             <li>Convert the data into easily manageable formats</li>
+         </ul>
+         </div>
+     """, unsafe_allow_html=True)
+
+     # Usage Instructions
+     st.markdown('<h3 class="guide-subheader">📝 Usage Instructions</h3>', unsafe_allow_html=True)
+
+     st.markdown("""
+         <div class="step-container">
+             <div class="step-number">1</div>
+             <div class="guide-text">Upload a document image containing a table (PNG, JPG, or JPEG format)</div>
+         </div>
+
+         <div class="step-container">
+             <div class="step-number">2</div>
+             <div class="guide-text">The tool will automatically detect and highlight the table in your image</div>
+         </div>
+
+         <div class="step-container">
+             <div class="step-number">3</div>
+             <div class="guide-text">View both raw and enhanced versions of the extracted data</div>
+         </div>
+
+         <div class="step-container">
+             <div class="step-number">4</div>
+             <div class="guide-text">Download the results in CSV format for further use</div>
+         </div>
+     """, unsafe_allow_html=True)
+
+     # Best Practices
+     st.markdown('<h3 class="guide-subheader">💡 Best Practices</h3>', unsafe_allow_html=True)
+     st.markdown("""
+         <div class="feature-card">
+             <strong>For Best Results:</strong>
+             <ul>
+                 <li>Use clear, high-resolution images</li>
+                 <li>Ensure tables have well-defined borders</li>
+                 <li>Avoid skewed or rotated images</li>
+                 <li>Make sure text is clearly readable</li>
+             </ul>
+         </div>
+     """, unsafe_allow_html=True)
+
+     # Technical Details (collapsible)
+     with st.expander("🔧 Technical Details"):
+         st.markdown("""
+             <div class="tech-details">
+                 <p><strong>Algorithm Overview:</strong></p>
+                 <ul>
+                     <li>Uses computer vision for table boundary detection</li>
+                     <li>Employs OCR (Optical Character Recognition) for text extraction</li>
+                     <li>Implements intelligent cell segmentation</li>
+                     <li>Applies post-processing for enhanced accuracy</li>
+                 </ul>
+             </div>
+         """, unsafe_allow_html=True)
+
+     # Support Info
+     st.markdown('<h3 class="guide-subheader">🔗 Connect with Me</h3>', unsafe_allow_html=True)
+     st.markdown("""
+         <div class="guide-text" style="font-size: 1rem;">
+         If you encounter any issues or have questions, feel free to reach out:
+         <a href="https://github.com/Sudhanshu1304" target="_blank" style="text-decoration: none;">
+             <img src="https://img.icons8.com/ios-filled/20/000000/github.png" alt="GitHub" style="vertical-align: middle; margin-right: 5px;"/>
+             GitHub
+         </a> |
+         <a href="https://www.linkedin.com/in/sudhanshu-pandey-847448193/" target="_blank" style="text-decoration: none;">
+             <img src="https://img.icons8.com/ios-filled/20/000000/linkedin.png" alt="LinkedIn" style="vertical-align: middle; margin-right: 5px;"/>
+             LinkedIn
+         </a> |
+         <a href="https://medium.com/@sudhanshu.dpandey" target="_blank" style="text-decoration: none;">
+             <img src="https://img.icons8.com/ios-filled/20/000000/medium-logo.png" alt="Medium" style="vertical-align: middle; margin-right: 5px;"/>
+             Medium
+         </a>
+         </div>
+     """, unsafe_allow_html=True)
+
+
+ # Initialize session state for expanded view
+ if 'is_expanded' not in st.session_state:
+     st.session_state.is_expanded = False
+
+
+ # Title and description
+ st.markdown("""
+     <div class="main-header">
+         <h1>📊 Table Extraction Tool</h1>
+         <p>Upload an image containing tables and instantly convert them into structured data formats.</p>
+     </div>
+ """, unsafe_allow_html=True)
+
+
+ # File upload section - reduced size
+ # st.markdown('<div class="content-card">', unsafe_allow_html=True)
+ # st.markdown("""
+ #     <div class="upload-section">
+ #         <div class="upload-icon">📥</div>
+ #         <h3 style="font-size: 1.1rem; margin: 0.5rem 0;">Upload Table Image</h3>
+ #         <p style="font-size: 0.9rem; margin: 0;">Supported formats: PNG, JPG, JPEG</p>
+ #     </div>
+ # """, unsafe_allow_html=True)
+ uploaded_file = st.file_uploader("", type=['png', 'jpg', 'jpeg'])
+ st.markdown('</div>', unsafe_allow_html=True)
+
+ # Process the uploaded file
+ if uploaded_file is not None:
+     with st.spinner('🔄 Processing your image...'):
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
+             tmp_file.write(uploaded_file.getvalue())
+             temp_path = tmp_file.name
+
+         try:
+             image = Image.open(uploaded_file)
+             (raw_df, cleaned_df), bbox = process_image(temp_path)
+
+             st.session_state.raw_data = raw_df
+             st.session_state.processed_data = cleaned_df
+             marked_image = draw_bounding_box(image, bbox[0])
+             st.session_state.marked_image = marked_image
+
+             # Side-by-side layout
+             col1, col2 = st.columns([0.4, 0.6])
+
+             with col1:
+                 # st.markdown('<div class="content-card image-container">', unsafe_allow_html=True)
+                 st.divider()
+                 st.markdown('<h3 class="results-header">Detected Table</h3>', unsafe_allow_html=True)
+                 st.image(marked_image, use_container_width=True)
+                 st.markdown('</div>', unsafe_allow_html=True)
+
+             with col2:
+                 # st.markdown('<div class="content-card">', unsafe_allow_html=True)
+                 st.divider()
+                 st.markdown('<h3 class="results-header">Extracted Data</h3>', unsafe_allow_html=True)
+
+                 # # Toggle button for expanded view
+                 # if st.button("🔍 Toggle Full View" if not st.session_state.is_expanded else "⬆️ Collapse View"):
+                 #     st.session_state.is_expanded = not st.session_state.is_expanded
+
+                 tabs = st.tabs(["🔍 Raw Data", "✨ Enhanced Data ⭐"])
+
+                 with tabs[0]:
+                     st.dataframe(st.session_state.raw_data,
+                                  use_container_width=True,
+                                  height=600 if not st.session_state.is_expanded else None)
+
+                     # Add HTML copy section for raw data
+                     st.markdown("### 📋 Copy HTML Table")
+                     html_raw = st.session_state.raw_data.to_html(index=False)
+                     st.markdown("""
+                         <div style="background-color: #f8fafc; padding: 0.5rem; border-radius: 8px; margin-bottom: 0.5rem;">
+                             <p style="margin: 0; color: #475569; font-size: 0.9rem;">
+                                 ℹ️ This HTML can be copied and used directly in websites, LLM prompts, or other applications.
+                             </p>
+                         </div>
+                     """, unsafe_allow_html=True)
+                     st.markdown("""
+                         <div style="max-height: 150px; overflow-y: auto; border-radius: 8px;">
+                     """, unsafe_allow_html=True)
+                     st.code(html_raw, language="html")
+                     st.markdown("</div>", unsafe_allow_html=True)
+
+                 with tabs[1]:
+                     st.markdown("""
+                         <div style="background-color: #f0f9ff; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
+                             <p style="margin: 0; color: #1e40af;">
+                                 ⭐ This is our enhanced version of the table with improved formatting and structure.
+                             </p>
+                         </div>
+                     """, unsafe_allow_html=True)
+                     st.dataframe(st.session_state.processed_data,
+                                  use_container_width=True,
+                                  height=600 if not st.session_state.is_expanded else None)
+
+                     # Add HTML copy section for enhanced data
+                     st.markdown("### 📋 Copy HTML Table")
+                     html_enhanced = st.session_state.processed_data.to_html(index=False)
+                     st.markdown("""
+                         <div style="background-color: #f8fafc; padding: 0.5rem; border-radius: 8px; margin-bottom: 0.5rem;">
+                             <p style="margin: 0; color: #475569; font-size: 0.9rem;">
+                                 ℹ️ This HTML can be copied and used directly in websites, LLM prompts, or other applications.
+                             </p>
+                         </div>
+                     """, unsafe_allow_html=True)
+                     st.markdown("""
+                         <div style="max-height: 150px; overflow-y: auto; border-radius: 8px;">
+                     """, unsafe_allow_html=True)
+                     st.code(html_enhanced, language="html")
+                     st.markdown("</div>", unsafe_allow_html=True)
+
+             # st.markdown('</div>', unsafe_allow_html=True)
+
+             # Download section below both columns
+             # st.markdown('<div class="content-card">', unsafe_allow_html=True)
+             st.divider()
+             st.markdown('<h3 class="results-header">Download Options</h3>', unsafe_allow_html=True)
+             download_cols = st.columns([1, 0.1, 1])
+
+             def get_csv_download_link(df, filename):
+                 csv = df.to_csv(index=False).encode()
+                 b64 = base64.b64encode(csv).decode()
+                 return f'<a href="data:file/csv;base64,{b64}" download="{filename}" class="download-button">📥 Download {filename}</a>'
+
+             with download_cols[0]:
+                 if 'raw_data' in st.session_state:
+                     csv = st.session_state.raw_data.to_csv(index=False)
+                     st.download_button(
+                         label="📥 Download Raw Data",
+                         data=csv,
+                         file_name="raw_data.csv",
+                         mime="text/csv",
+                         use_container_width=True,
+                         key="raw_download"
+                     )
+
+             with download_cols[2]:
+                 if 'processed_data' in st.session_state:
+                     csv = st.session_state.processed_data.to_csv(index=False)
+                     st.download_button(
+                         label="📥 Download Enhanced Data ⭐",
+                         data=csv,
+                         file_name="enhanced_data.csv",
+                         mime="text/csv",
+                         use_container_width=True,
+                         key="enhanced_download"
+                     )
+             st.markdown('</div>', unsafe_allow_html=True)
+
+         except Exception:
+             st.error(f"❌ Error processing image: {traceback.format_exc()}")
+
+         finally:
+             try:
+                 os.unlink(temp_path)
+             except Exception as e:
+                 st.warning(f"⚠️ Error removing temporary file: {str(e)}")
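
One operational note: streamlit_app.py imports table_creator and models as top-level packages, so the app presumably has to be launched from inside src/ (an inference from the imports, not something this commit documents), e.g. cd src && streamlit run streamlit_app.py.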
src/table_creator/__pycache__/data_structures.cpython-312.pyc ADDED
Binary file (8.74 kB)
 
src/table_creator/__pycache__/table_extractor.cpython-312.pyc ADDED
Binary file (9.27 kB)
 
src/table_creator/data_structures.py ADDED
@@ -0,0 +1,177 @@
+ from dataclasses import dataclass
+ from typing import Dict, List
+ import pandas as pd
+
+ @dataclass
+ class TableCell:
+     """
+     Represents a cell in a table with its value and position.
+
+     Attributes:
+         value: The text content of the cell
+         bbox: Bounding box coordinates [x1, y1, x2, y2]
+         column_name: Name of the column this cell belongs to
+     """
+     value: str
+     bbox: List[int]
+     column_name: str
+
+ @dataclass
+ class TableRow:
+     """
+     Represents a row in a table with its cells and boundaries.
+
+     Attributes:
+         cells: Dictionary of column name to TableCell
+         min_x: Minimum x coordinate of the row
+         max_x: Maximum x coordinate of the row
+         min_y: Minimum y coordinate of the row
+         max_y: Maximum y coordinate of the row
+     """
+     cells: Dict[str, TableCell]
+     min_x: float
+     max_x: float
+     min_y: float
+     max_y: float
+
+ class TableStructure:
+     """
+     Maintains the structure of a table using a linked-list-style row representation.
+     """
+
+     def __init__(self, debug: bool = False) -> None:
+         """
+         Initialize the table structure.
+
+         Args:
+             debug: Enable debug logging
+         """
+         self.rows: List[TableRow] = []
+         self.debug = debug
+
+     def build_structure(self, dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
+         """
+         Build table structure from column-wise dataframes.
+
+         Args:
+             dataframes: Dictionary of column name to DataFrame containing text and positions
+
+         Returns:
+             DataFrame with structured table data
+         """
+         if not dataframes:
+             return pd.DataFrame()
+
+         # Reset state so repeated calls do not accumulate rows from earlier tables
+         self.rows = []
+
+         # Initialize with first column
+         first_col = list(dataframes.keys())[0]
+         self._initialize_rows(first_col, dataframes[first_col])
+
+         # Process remaining columns
+         for col_name in list(dataframes.keys())[1:]:
+             self._process_column(col_name, dataframes[col_name])
+
+         return self._to_dataframe(dataframes.keys())
+
+     def _initialize_rows(self, column_name: str, df: pd.DataFrame) -> None:
+         """Initialize rows with the first column's data."""
+         for _, row in df.iterrows():
+             bbox = row['boundingBox']
+             self.rows.append(TableRow(
+                 cells={column_name: TableCell(row['text'], bbox, column_name)},
+                 min_x=bbox[0],
+                 max_x=bbox[2],
+                 min_y=bbox[1],
+                 max_y=bbox[3]
+             ))
+
+     def _process_column(self, column_name: str, df: pd.DataFrame) -> None:
+         """Process additional columns and align with existing rows."""
+         search_idx = 0
+
+         for _, row in df.iterrows():
+             text = row['text']
+             bbox = row['boundingBox']
+
+             matched = False
+             for idx, table_row in enumerate(self.rows[search_idx:], search_idx):
+                 overlap = self._calculate_overlap(
+                     bbox,
+                     [bbox[0], table_row.min_y, bbox[2], table_row.max_y]
+                 )
+
+                 if overlap > 10:
+                     self._update_row(idx, column_name, text, bbox)
+                     search_idx = idx + 1
+                     matched = True
+                     break
+                 elif bbox[3] <= table_row.min_y:
+                     self._insert_row(idx, column_name, text, bbox)
+                     search_idx = idx + 1
+                     matched = True
+                     break
+
+             if not matched and bbox[1] >= self.rows[-1].max_y:
+                 self._append_row(column_name, text, bbox)
+
+     def _calculate_overlap(self, rect1: List[int], rect2: List[int]) -> float:
+         """Calculate percentage overlap between two rectangles."""
+         x_left = max(rect1[0], rect2[0])
+         y_top = max(rect1[1], rect2[1])
+         x_right = min(rect1[2], rect2[2])
+         y_bottom = min(rect1[3], rect2[3])
+
+         if x_right < x_left or y_bottom < y_top:
+             return 0.0
+
+         intersection = (x_right - x_left) * (y_bottom - y_top)
+         min_area = min(
+             (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]),
+             (rect2[2] - rect2[0]) * (rect2[3] - rect2[1])
+         )
+
+         return (intersection / min_area * 100) if min_area > 0 else 0
+
+     def _update_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
+         """Update existing row with new cell data."""
+         self.rows[idx].cells[column_name] = TableCell(text, bbox, column_name)
+         self.rows[idx].min_x = min(self.rows[idx].min_x, bbox[0])
+         self.rows[idx].max_x = max(self.rows[idx].max_x, bbox[2])
+
+     def _insert_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
+         """Insert new row at specified index."""
+         self.rows.insert(idx, TableRow(
+             cells={column_name: TableCell(text, bbox, column_name)},
+             min_x=bbox[0],
+             max_x=bbox[2],
+             min_y=bbox[1],
+             max_y=bbox[3]
+         ))
+
+     def _append_row(self, column_name: str, text: str, bbox: List[int]) -> None:
+         """Append new row at the end."""
+         self.rows.append(TableRow(
+             cells={column_name: TableCell(text, bbox, column_name)},
+             min_x=bbox[0],
+             max_x=bbox[2],
+             min_y=bbox[1],
+             max_y=bbox[3]
+         ))
+
+     def _to_dataframe(self, columns: List[str]) -> pd.DataFrame:
+         """Convert table structure to DataFrame."""
+         data = []
+         for row in self.rows:
+             row_data = {
+                 col: row.cells[col].value if col in row.cells else None
+                 for col in columns
+             }
+             row_data.update({
+                 'row_min_x': row.min_x,
+                 'row_max_x': row.max_x,
+                 'row_min_y': row.min_y,
+                 'row_max_y': row.max_y
+             })
+             data.append(row_data)
+
+         return pd.DataFrame(data)
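
To make the row-alignment behaviour concrete, here is a toy example with two hand-built column DataFrames (all coordinates invented):

# Toy example: two columns whose cells overlap vertically, row by row
import pandas as pd
from table_creator.data_structures import TableStructure  # assumes src/ on sys.path

name_col = pd.DataFrame([('Alice', [10, 10, 60, 30]),
                         ('Bob',   [10, 40, 60, 60])], columns=['text', 'boundingBox'])
age_col = pd.DataFrame([('31', [80, 12, 110, 28]),
                        ('27', [80, 42, 110, 58])], columns=['text', 'boundingBox'])

table = TableStructure().build_structure({'Name': name_col, 'Age': age_col})
print(table[['Name', 'Age']])
# Expected: row 0 -> Alice, 31; row 1 -> Bob, 27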
src/table_creator/table_extractor.py ADDED
@@ -0,0 +1,148 @@
+ from models.table_detector import TableDetector
+ from models.text_recognizer import TextRecognizer
+ from table_creator.data_structures import TableStructure
+ import pandas as pd
+ import re
+
+ class TableExtraction:
+     def __init__(self) -> None:
+         self._table_detection = TableDetector()
+         self._document_ocr = TextRecognizer()
+         self._linklist = TableStructure()
+
+     def _merge_words(self, prev_obj, word, word_bb):
+         """Merge the current word with the previous one if they overlap significantly."""
+         merged_text = prev_obj[0] + ' ' + word
+         merged_bb = [
+             prev_obj[1][0], prev_obj[1][1], word_bb[2], word_bb[3]
+         ]
+         return (merged_text, merged_bb)
+
+     def _assign_to_column(self, word, word_bb, columns, df, debug=False):
+         """Assign a word to the correct column based on bounding box overlap."""
+         for key, col_bb in columns.items():
+             word_bb_temp = [word_bb[0], col_bb[1], word_bb[2], col_bb[3]]
+             overlap = self._table_detection._calculate_overlap(word_bb_temp, col_bb)
+
+             if overlap > 10:
+                 if len(df[key]) > 0:
+                     prev_obj = df[key][-1]
+                     prev_overlap = self._table_detection._calculate_overlap(
+                         prev_obj[1], [prev_obj[1][0], word_bb[1], prev_obj[1][2], word_bb[3]]
+                     )
+                     if prev_overlap >= 30:
+                         word, word_bb = self._merge_words(prev_obj, word, word_bb)
+                         df[key][-1] = (word, word_bb)
+                     else:
+                         df[key].append((word, word_bb))
+                 else:
+                     df[key].append((word, word_bb))
+                 # Dynamically adjust the column bounding box to fit the new word
+                 columns[key] = [
+                     min(word_bb[0], col_bb[0]), col_bb[1],
+                     max(word_bb[2], col_bb[2]), col_bb[3]
+                 ]
+                 return True
+         return False
+
+     def _get_normalized_bounding_box(self, imgsz: str, bb: list) -> pd.DataFrame:
+         """Placeholder: bounding-box normalization is not implemented yet."""
+         pass
+
+     def get_words_in_column(self, cords: dict, df_word: pd.DataFrame, merge=True, debug=False):
+         """Distribute words into their respective columns based on bounding box coordinates."""
+         df = {key: [] for key in cords}
+         unknown_columns = {}
+         unknown_data = {}
+
+         for index, row in df_word.iterrows():
+             word, word_bb = row['text'], list(map(int, row['boundingBox']))
+             if debug:
+                 print(f"\nProcessing word: '{word}'")
+
+             if not self._assign_to_column(word, word_bb, cords, df, debug):
+                 # Handle words that do not match any known column
+                 for key, val in unknown_columns.items():
+                     overlap = self._table_detection._calculate_overlap(
+                         val, [word_bb[0], val[1], word_bb[2], val[3]]
+                     )
+                     if overlap > 30:
+                         prev_obj = unknown_data[key][-1]
+                         prev_overlap = self._table_detection._calculate_overlap(
+                             prev_obj[1], [prev_obj[1][0], word_bb[1], prev_obj[1][2], word_bb[3]]
+                         )
+                         if prev_overlap >= 30:
+                             word, word_bb = self._merge_words(prev_obj, word, word_bb)
+                             unknown_data[key][-1] = (word, word_bb)
+                         else:
+                             unknown_data[key].append((word, word_bb))
+                         break
+                 else:
+                     # Create a new unknown column if no match is found
+                     unknown_key = f'{word}__{index}__'
+                     unknown_columns[unknown_key] = word_bb
+                     unknown_data[unknown_key] = [(word, word_bb)]
+
+         if merge:
+             df.update(unknown_data)
+
+         # Convert lists to DataFrames
+         df = {key: pd.DataFrame(val, columns=['text', 'boundingBox']) for key, val in df.items()}
+         return df, unknown_data, unknown_columns
+
+     def postprocess(self, parsed_df: pd.DataFrame, columns=None):
+         """Post-process the parsed DataFrame to merge columns and clean data."""
+         try:
+             parsed_df = parsed_df.dropna(how='all').reset_index(drop=True)
+             new_df = pd.DataFrame()
+
+             # Merge adjacent empty-header columns into their left neighbour
+             empty_columns = parsed_df.columns[parsed_df.iloc[:1].isna().all()].tolist()
+             for col in empty_columns[::-1]:
+                 col_idx = list(parsed_df.columns).index(col)
+                 if col_idx > 0:
+                     parsed_df.iloc[:, col_idx - 1] += ' ' + parsed_df.iloc[:, col_idx]
+             parsed_df = parsed_df.drop(columns=empty_columns)
+
+             if not columns:
+                 return parsed_df
+
+             used_indices = set()
+             for header in columns:
+                 match_indices = [i for i, col in enumerate(parsed_df.columns) if header in col]
+                 if match_indices:
+                     used_indices.update(match_indices)
+                     new_df[header] = parsed_df.iloc[:, match_indices].apply(
+                         lambda x: ' '.join(x.fillna('').str.strip()), axis=1
+                     )
+
+             # Include unused columns
+             unused_columns = [col for i, col in enumerate(parsed_df.columns) if i not in used_indices]
+             new_df = pd.concat([new_df, parsed_df[unused_columns]], axis=1)
+
+             return new_df
+         except Exception as e:
+             print(f"Error in postprocess: {e}")
+             return parsed_df
+
+     def detect(self, image_path: str):
+         """Detect tables in an image and extract their data."""
+         cords = self._table_detection.detect(image_path)
+         all_table_df = self._document_ocr.recognize(image_path, cords)
+
+         table_data = []
+         for table in all_table_df:
+             column_data, _, _ = self.get_words_in_column({}, table)
+             ordered_columns = sorted(column_data, key=lambda x: column_data[x].iloc[0]['boundingBox'][0])
+             dictword = {col: column_data[col] for col in ordered_columns}
+
+             df = self._linklist.build_structure(dictword)
+             df = df.loc[:, ordered_columns]
+             df = df.rename(columns=lambda col: re.sub(r'__\d+__', '', str(col)).strip())
+             df_postp = self.postprocess(df)
+
+             # Assign generic column names
+             df.columns = [f"column {i+1}" for i in range(df.shape[1])]
+             table_data.append((df, df_postp))
+
+         return table_data[0], cords
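
Putting it together, TableExtraction.detect can be exercised directly; the sketch below mirrors how streamlit_app.py consumes it (image path hypothetical, src/ assumed to be the working directory):

# Sketch: run from inside src/ so `models` and `table_creator` resolve
from table_creator.table_extractor import TableExtraction

extractor = TableExtraction()
(raw_df, cleaned_df), boxes = extractor.detect('sample_page.png')
print(boxes[0])           # largest detected table, [x1, y1, x2, y2]
print(cleaned_df.head())  # post-processed table contents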
src/table_creator/visualization.py ADDED
@@ -0,0 +1,93 @@
+ from typing import List, Tuple, Union
+ import cv2
+ import numpy as np
+ from PIL import Image
+
+ class TableVisualizer:
+     """
+     Utility class for visualizing detected tables and OCR results.
+     """
+
+     @staticmethod
+     def draw_boxes(
+         image: Union[np.ndarray, Image.Image],
+         boxes: List[List[int]],
+         color: Tuple[int, int, int] = (0, 255, 0),
+         thickness: int = 2
+     ) -> Image.Image:
+         """
+         Draw bounding boxes on an image.
+
+         Args:
+             image: Input image
+             boxes: List of bounding box coordinates [x1, y1, x2, y2]
+             color: RGB color for the boxes
+             thickness: Line thickness
+
+         Returns:
+             Image with drawn bounding boxes
+         """
+         if isinstance(image, Image.Image):
+             image = np.array(image)
+
+         if len(image.shape) == 2:
+             image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+         elif image.shape[2] == 4:
+             image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+
+         image_copy = image.copy()
+
+         for box in boxes:
+             cv2.rectangle(
+                 image_copy,
+                 (box[0], box[1]),
+                 (box[2], box[3]),
+                 color,
+                 thickness
+             )
+
+         return Image.fromarray(image_copy)
+
+     @staticmethod
+     def draw_text_boxes(
+         image: Union[np.ndarray, Image.Image],
+         text_data: List[Tuple[str, List[int]]],
+         color: Tuple[int, int, int] = (255, 0, 0),
+         thickness: int = 1
+     ) -> Image.Image:
+         """
+         Draw text boxes with labels on an image.
+
+         Args:
+             image: Input image
+             text_data: List of (text, bbox) tuples
+             color: RGB color for the boxes
+             thickness: Line thickness
+
+         Returns:
+             Image with drawn text boxes
+         """
+         if isinstance(image, Image.Image):
+             image = np.array(image)
+
+         image_copy = image.copy()
+
+         for text, bbox in text_data:
+             cv2.rectangle(
+                 image_copy,
+                 (bbox[0], bbox[1]),
+                 (bbox[2], bbox[3]),
+                 color,
+                 thickness
+             )
+             cv2.putText(
+                 image_copy,
+                 text[:20],
+                 (bbox[0], bbox[1] - 5),
+                 cv2.FONT_HERSHEY_SIMPLEX,
+                 0.5,
+                 color,
+                 thickness
+             )
+
+         return Image.fromarray(image_copy)
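
TableVisualizer is not wired into the app in this commit; one plausible use is previewing detector output before OCR (path and coordinates below are invented):

# Sketch: overlay a detected table box on the source image
from PIL import Image
from table_creator.visualization import TableVisualizer  # assumes src/ on sys.path

image = Image.open('sample_page.png')
annotated = TableVisualizer.draw_boxes(image, [[50, 80, 600, 400]], color=(0, 255, 0))
annotated.save('sample_page_annotated.png')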