Sudhanshu Pandey committed on
Commit a7b8c18 · 1 Parent(s): b1cb0f5

Update the files

LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Sudhanshu Pandey
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
images/image3.png ADDED
images/image4.png ADDED
requirements.txt ADDED
@@ -0,0 +1,114 @@
+ albucore==0.0.13
+ albumentations==1.4.10
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==4.8.0
+ asgiref==3.8.1
+ astor==0.8.1
+ attrs==24.3.0
+ beautifulsoup4==4.12.3
+ blinker==1.9.0
+ cachetools==5.5.0
+ certifi==2024.12.14
+ charset-normalizer==3.4.1
+ click==8.1.8
+ contourpy==1.3.1
+ crispy-bootstrap4==2024.10
+ cycler==0.12.1
+ Cython==3.0.11
+ decorator==5.1.1
+ filelock==3.16.1
+ filetype==1.2.0
+ fire==0.7.0
+ fonttools==4.55.3
+ fsspec==2024.12.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ h11==0.14.0
+ httpcore==1.0.7
+ httpx==0.28.1
+ hub-sdk==0.0.17
+ huggingface-hub==0.27.1
+ idna==3.7
+ imageio==2.36.1
+ imgaug==0.4.0
+ Jinja2==3.1.5
+ joblib==1.4.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ kiwisolver==1.4.8
+ lazy_loader==0.4
+ lmdb==1.6.2
+ lxml==5.3.0
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.10.0
+ mdurl==0.1.2
+ mpmath==1.3.0
+ narwhals==1.22.0
+ networkx==3.4.2
+ numpy==1.26.4
+ opencv-contrib-python==4.10.0.84
+ opencv-python==4.10.0.84
+ opencv-python-headless==4.10.0.84
+ opt-einsum==3.3.0
+ packaging==24.2
+ paddleocr==2.9.1
+ paddlepaddle==2.6.2
+ pandas==2.2.3
+ pillow==11.1.0
+ protobuf==3.20.3
+ psutil==6.1.1
+ py-cpuinfo==9.0.0
+ pyarrow==18.1.0
+ pybboxes==0.1.6
+ pyclipper==1.3.0.post6
+ pydantic==2.10.5
+ pydantic_core==2.27.2
+ pydeck==0.9.1
+ Pygments==2.19.1
+ pyparsing==3.2.1
+ python-dateutil==2.9.0.post0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ pytz==2024.2
+ PyYAML==6.0.2
+ RapidFuzz==3.11.0
+ referencing==0.35.1
+ requests==2.32.3
+ requests-toolbelt==1.0.0
+ rich==13.9.4
+ roboflow==1.1.50
+ rpds-py==0.22.3
+ sahi==0.11.20
+ scikit-image==0.25.0
+ scikit-learn==1.6.1
+ scipy==1.15.1
+ seaborn==0.13.2
+ setuptools==75.8.0
+ shapely==2.0.6
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soupsieve==2.6
+ sqlparse==0.5.3
+ streamlit==1.41.1
+ sympy==1.13.1
+ tenacity==9.0.0
+ termcolor==2.5.0
+ terminaltables==3.1.10
+ thop==0.1.1.post2209072238
+ threadpoolctl==3.5.0
+ tifffile==2025.1.10
+ toml==0.10.2
+ tomli==2.2.1
+ torch==2.5.1
+ torchvision==0.20.1
+ tornado==6.4.2
+ tqdm==4.67.1
+ typing_extensions==4.12.2
+ tzdata==2024.2
+ ultralytics==8.3.65
+ ultralytics-thop==2.0.14
+ ultralyticsplus==0.1.0
+ urllib3==2.3.0
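
The pins above are a complete freeze of the development environment. As an optional smoke test (not part of the commit), the heavyweight dependencies can be imported after running pip install -r requirements.txt:

# Hypothetical smoke test; run after `pip install -r requirements.txt`
import paddleocr  # imported only to confirm it loads
import streamlit
import ultralytics
print('streamlit', streamlit.__version__, '| ultralytics', ultralytics.__version__)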
src/__init__.py ADDED
File without changes
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (170 Bytes)
 
src/models/__pycache__/table_detector.cpython-312.pyc ADDED
Binary file (5.76 kB)
 
src/models/__pycache__/text_recognizer.cpython-312.pyc ADDED
Binary file (6.64 kB)
 
src/models/paddleocr_models/det/inference.pdiparams.info ADDED
Binary file (26.4 kB)
 
src/models/paddleocr_models/rec/inference.pdiparams.info ADDED
Binary file (103 kB)
 
src/models/table_detector.py ADDED
@@ -0,0 +1,123 @@
+ from pathlib import Path
+ from typing import Optional, Union
+ import numpy as np
+ from ultralytics import YOLO
+ # from ultralyticsplus import YOLO
+
+
+ class TableDetector:
+     """
+     A class for detecting tables in document images using YOLO models.
+
+     Attributes:
+         model_path (str): Path to the YOLO model weights
+         confidence (float): Confidence threshold for detection
+         iou_threshold (float): IoU threshold for NMS
+     """
+
+     def __init__(
+         self,
+         confidence: float = 0.50,
+         iou_threshold: float = 0.45
+     ) -> None:
+         """
+         Initialize the TableDetector with model and parameters.
+
+         Args:
+             confidence: Confidence threshold for detection
+             iou_threshold: IoU threshold for NMS
+         """
+         self.model_path = 'src/models/table-detection-and-extraction.pt'
+         self.model = YOLO(str(self.model_path))
+         self.min_conf = confidence
+         self.iou = iou_threshold
+
+     def detect(self, image_path: Union[str, Path]) -> Optional[list]:
+         """
+         Detect tables in the given image.
+
+         Args:
+             image_path: Path to the input image
+
+         Returns:
+             A one-element list holding the largest detected table's bounding
+             box, an empty list if nothing was detected, or None on failure
+         """
+         results = self.model.predict(str(image_path), verbose=False, iou=self.iou, conf=self.min_conf)
+         if results:
+             # Move tensors to CPU before converting to numpy
+             boxes = results[0].boxes.xyxy.cpu().numpy()
+             coords = self.merge_boxes(boxes)
+             # Keep only the largest merged box (by area)
+             return [sorted(coords, key=lambda x: (x[2] - x[0]) * (x[3] - x[1]), reverse=True)[0]] if len(coords) > 0 else []
+         return None
+
+     def merge_boxes(self, boxes: np.ndarray, overlap_threshold: float = 35) -> np.ndarray:
+         """
+         Merge overlapping bounding boxes.
+
+         Args:
+             boxes: Array of bounding box coordinates
+             overlap_threshold: Percentage overlap above which boxes are merged
+
+         Returns:
+             Array of merged bounding box coordinates
+         """
+         # Sort boxes by area in descending order
+         areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+         sorted_indices = np.argsort(-areas)
+         boxes = boxes[sorted_indices]
+
+         merged_boxes = []
+
+         for box in boxes:
+             if not merged_boxes:
+                 merged_boxes.append(box)
+                 continue
+
+             overlap_found = False
+             for i, merged_box in enumerate(merged_boxes):
+                 iou = self._calculate_overlap(box, merged_box)
+                 if iou > overlap_threshold:
+                     # Keep the larger box
+                     box_area = (box[2] - box[0]) * (box[3] - box[1])
+                     merged_area = (merged_box[2] - merged_box[0]) * (merged_box[3] - merged_box[1])
+                     if box_area > merged_area:
+                         merged_boxes[i] = box
+                     overlap_found = True
+                     break
+
+             if not overlap_found:
+                 merged_boxes.append(box)
+
+         return np.array(merged_boxes).astype(int)
+
+     @staticmethod
+     def _calculate_overlap(box1: np.ndarray, box2: np.ndarray) -> float:
+         """
+         Calculate the percentage overlap between two boxes.
+
+         Args:
+             box1: First bounding box coordinates
+             box2: Second bounding box coordinates
+
+         Returns:
+             Intersection area as a percentage of the smaller box's area
+         """
+         x_left = max(box1[0], box2[0])
+         y_top = max(box1[1], box2[1])
+         x_right = min(box1[2], box2[2])
+         y_bottom = min(box1[3], box2[3])
+
+         if x_right < x_left or y_bottom < y_top:
+             return 0.0
+
+         intersection_area = (x_right - x_left) * (y_bottom - y_top)
+         box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+         box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+         min_area = min(box1_area, box2_area)
+         if min_area == 0:
+             return 0.0
+
+         return (intersection_area / min_area) * 100
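
For orientation, here is a minimal usage sketch for TableDetector. The image filename is invented, and the hard-coded weights path in __init__ implies the process runs from the repository root:

# Sketch: assumes src/ is on sys.path (as the other modules' imports imply)
# and that 'sample_page.png' is a document image you supply.
from models.table_detector import TableDetector

detector = TableDetector(confidence=0.5, iou_threshold=0.45)
boxes = detector.detect('sample_page.png')
if boxes:
    x1, y1, x2, y2 = boxes[0]  # largest detected table, pixel coordinates
    print(f'Table at ({x1}, {y1})-({x2}, {y2})')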
src/models/text_recognizer.py ADDED
@@ -0,0 +1,115 @@
+ from pathlib import Path
+ from typing import List, Optional, Dict, Union
+ import numpy as np
+ import pandas as pd
+ from paddleocr import PaddleOCR
+ from PIL import Image
+
+ class TextRecognizer:
+     """
+     A class for performing OCR on detected tables using PaddleOCR.
+
+     Attributes:
+         models_dir (Path): Directory containing OCR model files
+     """
+
+     def __init__(self, models_dir: Optional[Union[str, Path]] = None) -> None:
+         """
+         Initialize the TextRecognizer with model directory.
+
+         Args:
+             models_dir: Directory containing OCR model files
+         """
+         self.models_dir = Path(models_dir) if models_dir else Path(__file__).parent / 'paddleocr_models'
+         self._setup_model_dirs()
+
+         self.model = PaddleOCR(
+             use_angle_cls=False,
+             lang='en',
+             det_model_dir=str(self.models_dir / 'det'),
+             rec_model_dir=str(self.models_dir / 'rec')
+         )
+
+     def _setup_model_dirs(self) -> None:
+         """Create necessary directories for model files."""
+         (self.models_dir / 'det').mkdir(parents=True, exist_ok=True)
+         (self.models_dir / 'rec').mkdir(parents=True, exist_ok=True)
+
+     def recognize(
+         self,
+         image_path: Union[str, Path],
+         table_boxes: Optional[np.ndarray] = None,
+         padding: tuple = (0, 0)
+     ) -> List[pd.DataFrame]:
+         """
+         Perform OCR on the image within specified table regions.
+
+         Args:
+             image_path: Path to the input image
+             table_boxes: Array of table bounding box coordinates
+             padding: Padding to add around table regions (x, y)
+
+         Returns:
+             List of DataFrames containing extracted text and positions
+         """
+         with Image.open(image_path) as img:
+             img_array = np.array(img.convert('RGB'))
+
+         # Crop to the single detected table (with optional padding) before OCR
+         if table_boxes is not None and len(table_boxes) == 1:
+             pad_x, pad_y = padding
+             box = table_boxes[0]
+             img_array = img_array[
+                 max(box[1] - pad_y, 0):box[3] + pad_y,
+                 max(box[0] - pad_x, 0):box[2] + pad_x
+             ]
+
+         ocr_result = self.model.ocr(img_array)
+
+         # PaddleOCR returns [None] when no text is found
+         if not ocr_result or ocr_result[0] is None:
+             return [pd.DataFrame(columns=['text', 'boundingBox'])]
+
+         if table_boxes is not None and len(table_boxes) > 1:
+             return self._process_multiple_tables(ocr_result[0], table_boxes)
+         return self._process_single_table(ocr_result[0])
+
+     def _process_multiple_tables(
+         self,
+         ocr_data: List,
+         table_boxes: np.ndarray
+     ) -> List[pd.DataFrame]:
+         """Process OCR results for multiple tables."""
+         result: Dict[int, List] = {}
+
+         for item in ocr_data:
+             bbox = np.array(item[0]).astype(int)
+             word = item[1][0]
+             bbox = [bbox[:, 0].min(), bbox[:, 1].min(), bbox[:, 0].max(), bbox[:, 1].max()]
+
+             for idx, table_box in enumerate(table_boxes):
+                 if (bbox[0] >= table_box[0] and bbox[1] >= table_box[1] and
+                         bbox[0] <= table_box[2] and bbox[1] <= table_box[3]):
+                     if idx not in result:
+                         result[idx] = []
+                     result[idx].append((word, bbox))
+
+         return [
+             pd.DataFrame(
+                 sorted(table_data, key=lambda x: (x[1][1], x[1][0])),
+                 columns=['text', 'boundingBox']
+             )
+             for table_data in result.values()
+         ]
+
+     def _process_single_table(self, ocr_data: List) -> List[pd.DataFrame]:
+         """Process OCR results for a single table."""
+         processed_data = [
+             (item[1][0], [
+                 np.array(item[0])[:, 0].min(),
+                 np.array(item[0])[:, 1].min(),
+                 np.array(item[0])[:, 0].max(),
+                 np.array(item[0])[:, 1].max()
+             ])
+             for item in ocr_data
+         ]
+
+         return [pd.DataFrame(
+             sorted(processed_data, key=lambda x: (x[1][1], x[1][0])),
+             columns=['text', 'boundingBox']
+         )]
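
A sketch of the detector-to-recognizer handoff (file name hypothetical; the repository appears to bundle the PaddleOCR det/rec weights under src/models/paddleocr_models):

# Sketch: run with src/ on sys.path; 'sample_page.png' is a placeholder
from models.table_detector import TableDetector
from models.text_recognizer import TextRecognizer

detector = TableDetector()
recognizer = TextRecognizer()  # defaults to the bundled paddleocr_models dir

boxes = detector.detect('sample_page.png')
tables = recognizer.recognize('sample_page.png', table_boxes=boxes, padding=(5, 5))
print(tables[0].head())  # DataFrame with 'text' and 'boundingBox' columns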
src/streamlit_app.py ADDED
@@ -0,0 +1,475 @@
+ from table_creator.table_extractor import TableExtraction
+ import streamlit as st
+ import base64
+ from PIL import Image
+ import os
+ import cv2
+ import numpy as np
+ import tempfile
+ import traceback
+
+ # Load the models only once per session
+ if 'tab_ext' not in st.session_state:
+     st.session_state.tab_ext = TableExtraction()
+     print('Models loaded.')
+
+
+ def process_image(imgpath):
+     return st.session_state.tab_ext.detect(imgpath)
+
+ def draw_bounding_box(image, bbox):
+     """Draw a bounding box on the image."""
+     img_array = np.array(image)
+     if len(img_array.shape) == 3:
+         img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
+
+     # Cast to plain ints so cv2.rectangle accepts numpy-typed coordinates
+     x_min, y_min, x_max, y_max = map(int, bbox)
+     cv2.rectangle(img_array, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
+
+     if len(img_array.shape) == 3:
+         img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
+
+     return Image.fromarray(img_array)
+
+
+ # Set page config
+ st.set_page_config(
+     page_title="Table Extraction Tool",
+     layout="wide",
+     initial_sidebar_state="expanded"  # Expanded so the guide shows by default
+ )
+
+
+ # Enhanced CSS styling with updated upload section
+ st.markdown("""
+     <style>
+     /* Main container and background */
+     .main { padding: 1.5rem; }
+     .stApp {
+         background: linear-gradient(135deg, #f6f9fc 0%, #f0f4f8 100%);
+     }
+
+     /* Header styling */
+     .main-header {
+         background: linear-gradient(90deg, #1a365d 0%, #2563eb 100%);
+         color: white;
+         padding: 2rem 3rem;
+         border-radius: 15px;
+         margin-bottom: 2rem;
+         box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+     }
+
+     .main-header h1 {
+         font-size: 2.5rem;
+         margin-bottom: 0.5rem;
+         font-weight: 600;
+         color: white;
+     }
+
+     .main-header p {
+         font-size: 1.1rem;
+         opacity: 0.9;
+     }
+
+     /* Card containers */
+     .content-card {
+         background-color: white;
+         padding: 1.5rem;
+         border-radius: 12px;
+         box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+         border: 1px solid #e5e7eb;
+         margin-bottom: 1.5rem;
+     }
+
+     /* Upload section - reduced size */
+     .upload-section {
+         text-align: center;
+         padding: 1rem;
+         border: 2px dashed #e5e7eb;
+         border-radius: 12px;
+         background-color: #f8fafc;
+         max-width: 600px;
+         margin: 0 auto;
+     }
+
+     .upload-icon {
+         font-size: 1.5rem;
+         color: #2563eb;
+         margin-bottom: 0.5rem;
+     }
+
+     /* Results section */
+     .results-header {
+         font-size: 1.25rem;
+         color: #1f2937;
+         margin-bottom: 1rem;
+         padding-bottom: 0.5rem;
+         border-bottom: 2px solid #e5e7eb;
+     }
+
+     /* Download buttons */
+     .download-button {
+         display: inline-block;
+         padding: 0.75rem 1.5rem;
+         background-color: #2563eb;
+         color: white;
+         text-decoration: none;
+         border-radius: 8px;
+         transition: all 0.2s;
+         text-align: center;
+         width: 100%;
+     }
+
+     .download-button:hover {
+         background-color: #1d4ed8;
+         box-shadow: 0 4px 6px -1px rgba(37, 99, 235, 0.2);
+     }
+
+     /* Tabs styling */
+     .stTabs [data-baseweb="tab-list"] {
+         gap: 1rem;
+         background-color: #f8fafc;
+         padding: 0.5rem;
+         border-radius: 8px;
+     }
+
+     .stTabs [data-baseweb="tab"] {
+         color: #4b5563;
+         font-weight: 500;
+         padding: 0.5rem 1.5rem;
+         border-radius: 6px;
+     }
+
+     .stTabs [data-baseweb="tab"][aria-selected="true"] {
+         background-color: #2563eb;
+         color: white;
+     }
+
+     /* Guide section styling */
+     .guide-section {
+         background-color: white;
+         padding: 2rem;
+         border-radius: 12px;
+         margin-bottom: 1.5rem;
+     }
+
+     .guide-header {
+         color: #1a365d;
+         font-size: 1.5rem;
+         margin-bottom: 1rem;
+         border-bottom: 2px solid #e5e7eb;
+         padding-bottom: 0.5rem;
+     }
+
+     .guide-subheader {
+         color: #2563eb;
+         font-size: 1.2rem;
+         margin: 1.5rem 0 0.5rem 0;
+     }
+
+     .guide-text {
+         color: #4b5563;
+         line-height: 1.6;
+         margin-bottom: 1rem;
+     }
+
+     .feature-card {
+         background-color: #f8fafc;
+         padding: 1rem;
+         border-radius: 8px;
+         margin-bottom: 1rem;
+         border-left: 4px solid #2563eb;
+     }
+
+     .step-container {
+         display: flex;
+         align-items: flex-start;
+         margin-bottom: 1rem;
+     }
+
+     .step-number {
+         background-color: #2563eb;
+         color: white;
+         width: 24px;
+         height: 24px;
+         border-radius: 12px;
+         display: flex;
+         align-items: center;
+         justify-content: center;
+         margin-right: 1rem;
+         flex-shrink: 0;
+     }
+
+     .info-icon {
+         color: #2563eb;
+         margin-right: 0.5rem;
+     }
+
+     .tech-details {
+         background-color: #f0f9ff;
+         padding: 1rem;
+         border-radius: 8px;
+         margin: 1rem 0;
+     }
+     </style>
+ """, unsafe_allow_html=True)
+
+
+ # Create sidebar with guide content
+ with st.sidebar:
+     # st.markdown('<div class="guide-section">', unsafe_allow_html=True)
+     st.divider()
+     st.markdown('<h2 class="guide-header">📚 User Guide</h2>', unsafe_allow_html=True)
+
+     # How It Works section
+     st.markdown('<h3 class="guide-subheader">🎯 How It Works</h3>', unsafe_allow_html=True)
+     st.markdown("""
+         <div class="guide-text">
+         This tool uses computer vision and machine learning techniques to:
+         <ul>
+             <li>Detect and locate tables in document images</li>
+             <li>Extract structured data from the detected tables</li>
+             <li>Convert the data into easily manageable formats</li>
+         </ul>
+         </div>
+     """, unsafe_allow_html=True)
+
+     # Usage Instructions
+     st.markdown('<h3 class="guide-subheader">📝 Usage Instructions</h3>', unsafe_allow_html=True)
+
+     st.markdown("""
+         <div class="step-container">
+             <div class="step-number">1</div>
+             <div class="guide-text">Upload a document image containing a table (PNG, JPG, or JPEG format)</div>
+         </div>
+
+         <div class="step-container">
+             <div class="step-number">2</div>
+             <div class="guide-text">The tool will automatically detect and highlight the table in your image</div>
+         </div>
+
+         <div class="step-container">
+             <div class="step-number">3</div>
+             <div class="guide-text">View both raw and enhanced versions of the extracted data</div>
+         </div>
+
+         <div class="step-container">
+             <div class="step-number">4</div>
+             <div class="guide-text">Download the results in CSV format for further use</div>
+         </div>
+     """, unsafe_allow_html=True)
+
+     # Best Practices
+     st.markdown('<h3 class="guide-subheader">💡 Best Practices</h3>', unsafe_allow_html=True)
+     st.markdown("""
+         <div class="feature-card">
+             <strong>For Best Results:</strong>
+             <ul>
+                 <li>Use clear, high-resolution images</li>
+                 <li>Ensure tables have well-defined borders</li>
+                 <li>Avoid skewed or rotated images</li>
+                 <li>Make sure text is clearly readable</li>
+             </ul>
+         </div>
+     """, unsafe_allow_html=True)
+
+     # Technical Details (collapsible)
+     with st.expander("🔧 Technical Details"):
+         st.markdown("""
+             <div class="tech-details">
+                 <p><strong>Algorithm Overview:</strong></p>
+                 <ul>
+                     <li>Uses computer vision for table boundary detection</li>
+                     <li>Employs OCR (Optical Character Recognition) for text extraction</li>
+                     <li>Implements intelligent cell segmentation</li>
+                     <li>Applies post-processing for enhanced accuracy</li>
+                 </ul>
+             </div>
+         """, unsafe_allow_html=True)
+
+     # Support Info
+     st.markdown('<h3 class="guide-subheader">🔗 Connect with Me</h3>', unsafe_allow_html=True)
+     st.markdown("""
+         <div class="guide-text" style="font-size: 1rem;">
+         If you encounter any issues or have questions, feel free to reach out:
+         <a href="https://github.com/Sudhanshu1304" target="_blank" style="text-decoration: none;">
+             <img src="https://img.icons8.com/ios-filled/20/000000/github.png" alt="GitHub" style="vertical-align: middle; margin-right: 5px;"/>
+             GitHub
+         </a> |
+         <a href="https://www.linkedin.com/in/sudhanshu-pandey-847448193/" target="_blank" style="text-decoration: none;">
+             <img src="https://img.icons8.com/ios-filled/20/000000/linkedin.png" alt="LinkedIn" style="vertical-align: middle; margin-right: 5px;"/>
+             LinkedIn
+         </a> |
+         <a href="https://medium.com/@sudhanshu.dpandey" target="_blank" style="text-decoration: none;">
+             <img src="https://img.icons8.com/ios-filled/20/000000/medium-logo.png" alt="Medium" style="vertical-align: middle; margin-right: 5px;"/>
+             Medium
+         </a>
+         </div>
+     """, unsafe_allow_html=True)
+
+
+ # Initialize session state for expanded view
+ if 'is_expanded' not in st.session_state:
+     st.session_state.is_expanded = False
+
+
+ # Title and description
+ st.markdown("""
+     <div class="main-header">
+         <h1>📊 Table Extraction Tool</h1>
+         <p>Upload an image containing tables and instantly convert them into structured data formats.</p>
+     </div>
+ """, unsafe_allow_html=True)
+
+
+ # File upload section - reduced size
+ # st.markdown('<div class="content-card">', unsafe_allow_html=True)
+ # st.markdown("""
+ #     <div class="upload-section">
+ #         <div class="upload-icon">📥</div>
+ #         <h3 style="font-size: 1.1rem; margin: 0.5rem 0;">Upload Table Image</h3>
+ #         <p style="font-size: 0.9rem; margin: 0;">Supported formats: PNG, JPG, JPEG</p>
+ #     </div>
+ # """, unsafe_allow_html=True)
+ uploaded_file = st.file_uploader("", type=['png', 'jpg', 'jpeg'])
+ st.markdown('</div>', unsafe_allow_html=True)
+
+ # Process the uploaded file
+ if uploaded_file is not None:
+     with st.spinner('🔄 Processing your image...'):
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
+             tmp_file.write(uploaded_file.getvalue())
+             temp_path = tmp_file.name
+
+         try:
+             image = Image.open(uploaded_file)
+             (raw_df, cleaned_df), bbox = process_image(temp_path)
+
+             st.session_state.raw_data = raw_df
+             st.session_state.processed_data = cleaned_df
+             marked_image = draw_bounding_box(image, bbox[0])
+             st.session_state.marked_image = marked_image
+
+             # Side-by-side layout
+             col1, col2 = st.columns([0.4, 0.6])
+
+             with col1:
+                 # st.markdown('<div class="content-card image-container">', unsafe_allow_html=True)
+                 st.divider()
+                 st.markdown('<h3 class="results-header">Detected Table</h3>', unsafe_allow_html=True)
+                 st.image(marked_image, use_container_width=True)
+                 st.markdown('</div>', unsafe_allow_html=True)
+
+             with col2:
+                 # st.markdown('<div class="content-card">', unsafe_allow_html=True)
+                 st.divider()
+                 st.markdown('<h3 class="results-header">Extracted Data</h3>', unsafe_allow_html=True)
+
+                 # # Toggle button for expanded view
+                 # if st.button("🔍 Toggle Full View" if not st.session_state.is_expanded else "⬆️ Collapse View"):
+                 #     st.session_state.is_expanded = not st.session_state.is_expanded
+
+                 tabs = st.tabs(["🔍 Raw Data", "✨ Enhanced Data ⭐"])
+
+                 with tabs[0]:
+                     st.dataframe(st.session_state.raw_data,
+                                  use_container_width=True,
+                                  height=600 if not st.session_state.is_expanded else None)
+
+                     # Add HTML copy section for raw data
+                     st.markdown("### 📋 Copy HTML Table")
+                     html_raw = st.session_state.raw_data.to_html(index=False)
+                     st.markdown("""
+                         <div style="background-color: #f8fafc; padding: 0.5rem; border-radius: 8px; margin-bottom: 0.5rem;">
+                             <p style="margin: 0; color: #475569; font-size: 0.9rem;">
+                                 ℹ️ This HTML can be copied and used directly in websites, LLM prompts, or other applications.
+                             </p>
+                         </div>
+                     """, unsafe_allow_html=True)
+                     st.markdown("""
+                         <div style="max-height: 150px; overflow-y: auto; border-radius: 8px;">
+                     """, unsafe_allow_html=True)
+                     st.code(html_raw, language="html")
+                     st.markdown("</div>", unsafe_allow_html=True)
+
+                 with tabs[1]:
+                     st.markdown("""
+                         <div style="background-color: #f0f9ff; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
+                             <p style="margin: 0; color: #1e40af;">
+                                 ⭐ This is our enhanced version of the table with improved formatting and structure.
+                             </p>
+                         </div>
+                     """, unsafe_allow_html=True)
+                     st.dataframe(st.session_state.processed_data,
+                                  use_container_width=True,
+                                  height=600 if not st.session_state.is_expanded else None)
+
+                     # Add HTML copy section for enhanced data
+                     st.markdown("### 📋 Copy HTML Table")
+                     html_enhanced = st.session_state.processed_data.to_html(index=False)
+                     st.markdown("""
+                         <div style="background-color: #f8fafc; padding: 0.5rem; border-radius: 8px; margin-bottom: 0.5rem;">
+                             <p style="margin: 0; color: #475569; font-size: 0.9rem;">
+                                 ℹ️ This HTML can be copied and used directly in websites, LLM prompts, or other applications.
+                             </p>
+                         </div>
+                     """, unsafe_allow_html=True)
+                     st.markdown("""
+                         <div style="max-height: 150px; overflow-y: auto; border-radius: 8px;">
+                     """, unsafe_allow_html=True)
+                     st.code(html_enhanced, language="html")
+                     st.markdown("</div>", unsafe_allow_html=True)
+
+             # st.markdown('</div>', unsafe_allow_html=True)
+
+             # Download section below both columns
+             # st.markdown('<div class="content-card">', unsafe_allow_html=True)
+             st.divider()
+             st.markdown('<h3 class="results-header">Download Options</h3>', unsafe_allow_html=True)
+             download_cols = st.columns([1, 0.1, 1])
+
+             def get_csv_download_link(df, filename):
+                 csv = df.to_csv(index=False).encode()
+                 b64 = base64.b64encode(csv).decode()
+                 return f'<a href="data:file/csv;base64,{b64}" download="{filename}" class="download-button">📥 Download {filename}</a>'
+
+             with download_cols[0]:
+                 if 'raw_data' in st.session_state:
+                     csv = st.session_state.raw_data.to_csv(index=False)
+                     st.download_button(
+                         label="📥 Download Raw Data",
+                         data=csv,
+                         file_name="raw_data.csv",
+                         mime="text/csv",
+                         use_container_width=True,
+                         key="raw_download"
+                     )
+
+             with download_cols[2]:
+                 if 'processed_data' in st.session_state:
+                     csv = st.session_state.processed_data.to_csv(index=False)
+                     st.download_button(
+                         label="📥 Download Enhanced Data ⭐",
+                         data=csv,
+                         file_name="enhanced_data.csv",
+                         mime="text/csv",
+                         use_container_width=True,
+                         key="enhanced_download"
+                     )
+             st.markdown('</div>', unsafe_allow_html=True)
+
+         except Exception:
+             st.error(f"❌ Error processing image: {traceback.format_exc()}")
+
+         finally:
+             try:
+                 os.unlink(temp_path)
+             except Exception as e:
+                 st.warning(f"⚠️ Error removing temporary file: {str(e)}")
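
One operational note: streamlit_app.py imports table_creator and models as top-level packages, so the app presumably has to be launched from inside src/ (an inference from the imports, not something this commit documents), e.g. cd src && streamlit run streamlit_app.py.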
src/table_creator/__pycache__/data_structures.cpython-312.pyc ADDED
Binary file (8.74 kB)
 
src/table_creator/__pycache__/table_extractor.cpython-312.pyc ADDED
Binary file (9.27 kB)
 
src/table_creator/data_structures.py ADDED
@@ -0,0 +1,177 @@
+ from dataclasses import dataclass
+ from typing import Dict, List
+ import pandas as pd
+
+ @dataclass
+ class TableCell:
+     """
+     Represents a cell in a table with its value and position.
+
+     Attributes:
+         value: The text content of the cell
+         bbox: Bounding box coordinates [x1, y1, x2, y2]
+         column_name: Name of the column this cell belongs to
+     """
+     value: str
+     bbox: List[int]
+     column_name: str
+
+ @dataclass
+ class TableRow:
+     """
+     Represents a row in a table with its cells and boundaries.
+
+     Attributes:
+         cells: Dictionary of column name to TableCell
+         min_x: Minimum x coordinate of the row
+         max_x: Maximum x coordinate of the row
+         min_y: Minimum y coordinate of the row
+         max_y: Maximum y coordinate of the row
+     """
+     cells: Dict[str, TableCell]
+     min_x: float
+     max_x: float
+     min_y: float
+     max_y: float
+
+ class TableStructure:
+     """
+     Maintains the structure of a table using a linked-list-style row representation.
+     """
+
+     def __init__(self, debug: bool = False) -> None:
+         """
+         Initialize the table structure.
+
+         Args:
+             debug: Enable debug logging
+         """
+         self.rows: List[TableRow] = []
+         self.debug = debug
+
+     def build_structure(self, dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
+         """
+         Build table structure from column-wise dataframes.
+
+         Args:
+             dataframes: Dictionary of column name to DataFrame containing text and positions
+
+         Returns:
+             DataFrame with structured table data
+         """
+         if not dataframes:
+             return pd.DataFrame()
+
+         # Reset state so repeated calls do not accumulate rows from earlier tables
+         self.rows = []
+
+         # Initialize with first column
+         first_col = list(dataframes.keys())[0]
+         self._initialize_rows(first_col, dataframes[first_col])
+
+         # Process remaining columns
+         for col_name in list(dataframes.keys())[1:]:
+             self._process_column(col_name, dataframes[col_name])
+
+         return self._to_dataframe(dataframes.keys())
+
+     def _initialize_rows(self, column_name: str, df: pd.DataFrame) -> None:
+         """Initialize rows with the first column's data."""
+         for _, row in df.iterrows():
+             bbox = row['boundingBox']
+             self.rows.append(TableRow(
+                 cells={column_name: TableCell(row['text'], bbox, column_name)},
+                 min_x=bbox[0],
+                 max_x=bbox[2],
+                 min_y=bbox[1],
+                 max_y=bbox[3]
+             ))
+
+     def _process_column(self, column_name: str, df: pd.DataFrame) -> None:
+         """Process additional columns and align with existing rows."""
+         search_idx = 0
+
+         for _, row in df.iterrows():
+             text = row['text']
+             bbox = row['boundingBox']
+
+             matched = False
+             for idx, table_row in enumerate(self.rows[search_idx:], search_idx):
+                 overlap = self._calculate_overlap(
+                     bbox,
+                     [bbox[0], table_row.min_y, bbox[2], table_row.max_y]
+                 )
+
+                 if overlap > 10:
+                     self._update_row(idx, column_name, text, bbox)
+                     search_idx = idx + 1
+                     matched = True
+                     break
+                 elif bbox[3] <= table_row.min_y:
+                     self._insert_row(idx, column_name, text, bbox)
+                     search_idx = idx + 1
+                     matched = True
+                     break
+
+             if not matched and bbox[1] >= self.rows[-1].max_y:
+                 self._append_row(column_name, text, bbox)
+
+     def _calculate_overlap(self, rect1: List[int], rect2: List[int]) -> float:
+         """Calculate percentage overlap between two rectangles."""
+         x_left = max(rect1[0], rect2[0])
+         y_top = max(rect1[1], rect2[1])
+         x_right = min(rect1[2], rect2[2])
+         y_bottom = min(rect1[3], rect2[3])
+
+         if x_right < x_left or y_bottom < y_top:
+             return 0.0
+
+         intersection = (x_right - x_left) * (y_bottom - y_top)
+         min_area = min(
+             (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]),
+             (rect2[2] - rect2[0]) * (rect2[3] - rect2[1])
+         )
+
+         return (intersection / min_area * 100) if min_area > 0 else 0
+
+     def _update_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
+         """Update existing row with new cell data."""
+         self.rows[idx].cells[column_name] = TableCell(text, bbox, column_name)
+         self.rows[idx].min_x = min(self.rows[idx].min_x, bbox[0])
+         self.rows[idx].max_x = max(self.rows[idx].max_x, bbox[2])
+
+     def _insert_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
+         """Insert new row at specified index."""
+         self.rows.insert(idx, TableRow(
+             cells={column_name: TableCell(text, bbox, column_name)},
+             min_x=bbox[0],
+             max_x=bbox[2],
+             min_y=bbox[1],
+             max_y=bbox[3]
+         ))
+
+     def _append_row(self, column_name: str, text: str, bbox: List[int]) -> None:
+         """Append new row at the end."""
+         self.rows.append(TableRow(
+             cells={column_name: TableCell(text, bbox, column_name)},
+             min_x=bbox[0],
+             max_x=bbox[2],
+             min_y=bbox[1],
+             max_y=bbox[3]
+         ))
+
+     def _to_dataframe(self, columns: List[str]) -> pd.DataFrame:
+         """Convert table structure to DataFrame."""
+         data = []
+         for row in self.rows:
+             row_data = {
+                 col: row.cells[col].value if col in row.cells else None
+                 for col in columns
+             }
+             row_data.update({
+                 'row_min_x': row.min_x,
+                 'row_max_x': row.max_x,
+                 'row_min_y': row.min_y,
+                 'row_max_y': row.max_y
+             })
+             data.append(row_data)
+
+         return pd.DataFrame(data)
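
To make the row-alignment behaviour concrete, here is a toy example with two hand-built column DataFrames (all coordinates invented):

# Toy example: two columns whose cells overlap vertically, row by row
import pandas as pd
from table_creator.data_structures import TableStructure  # assumes src/ on sys.path

name_col = pd.DataFrame([('Alice', [10, 10, 60, 30]),
                         ('Bob',   [10, 40, 60, 60])], columns=['text', 'boundingBox'])
age_col = pd.DataFrame([('31', [80, 12, 110, 28]),
                        ('27', [80, 42, 110, 58])], columns=['text', 'boundingBox'])

table = TableStructure().build_structure({'Name': name_col, 'Age': age_col})
print(table[['Name', 'Age']])
# Expected: row 0 -> Alice, 31; row 1 -> Bob, 27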
src/table_creator/table_extractor.py ADDED
@@ -0,0 +1,148 @@
+ from models.table_detector import TableDetector
+ from models.text_recognizer import TextRecognizer
+ from table_creator.data_structures import TableStructure
+ import pandas as pd
+ import re
+
+ class TableExtraction:
+     def __init__(self) -> None:
+         self._table_detection = TableDetector()
+         self._document_ocr = TextRecognizer()
+         self._linklist = TableStructure()
+
+     def _merge_words(self, prev_obj, word, word_bb):
+         """Merge the current word with the previous one if they overlap significantly."""
+         merged_text = prev_obj[0] + ' ' + word
+         merged_bb = [
+             prev_obj[1][0], prev_obj[1][1], word_bb[2], word_bb[3]
+         ]
+         return (merged_text, merged_bb)
+
+     def _assign_to_column(self, word, word_bb, columns, df, debug=False):
+         """Assign a word to the correct column based on bounding box overlap."""
+         for key, col_bb in columns.items():
+             word_bb_temp = [word_bb[0], col_bb[1], word_bb[2], col_bb[3]]
+             overlap = self._table_detection._calculate_overlap(word_bb_temp, col_bb)
+
+             if overlap > 10:
+                 if len(df[key]) > 0:
+                     prev_obj = df[key][-1]
+                     prev_overlap = self._table_detection._calculate_overlap(
+                         prev_obj[1], [prev_obj[1][0], word_bb[1], prev_obj[1][2], word_bb[3]]
+                     )
+                     if prev_overlap >= 30:
+                         word, word_bb = self._merge_words(prev_obj, word, word_bb)
+                         df[key][-1] = (word, word_bb)
+                     else:
+                         df[key].append((word, word_bb))
+                 else:
+                     df[key].append((word, word_bb))
+                 # Dynamically adjust the column bounding box to fit the new word
+                 columns[key] = [
+                     min(word_bb[0], col_bb[0]), col_bb[1],
+                     max(word_bb[2], col_bb[2]), col_bb[3]
+                 ]
+                 return True
+         return False
+
+     def _get_normalized_bounding_box(self, imgsz: str, bb: list) -> pd.DataFrame:
+         """Placeholder: bounding-box normalization is not implemented yet."""
+         pass
+
+     def get_words_in_column(self, cords: dict, df_word: pd.DataFrame, merge=True, debug=False):
+         """Distribute words into their respective columns based on bounding box coordinates."""
+         df = {key: [] for key in cords}
+         unknown_columns = {}
+         unknown_data = {}
+
+         for index, row in df_word.iterrows():
+             word, word_bb = row['text'], list(map(int, row['boundingBox']))
+             if debug:
+                 print(f"\nProcessing word: '{word}'")
+
+             if not self._assign_to_column(word, word_bb, cords, df, debug):
+                 # Handle words that do not match any known column
+                 for key, val in unknown_columns.items():
+                     overlap = self._table_detection._calculate_overlap(
+                         val, [word_bb[0], val[1], word_bb[2], val[3]]
+                     )
+                     if overlap > 30:
+                         prev_obj = unknown_data[key][-1]
+                         prev_overlap = self._table_detection._calculate_overlap(
+                             prev_obj[1], [prev_obj[1][0], word_bb[1], prev_obj[1][2], word_bb[3]]
+                         )
+                         if prev_overlap >= 30:
+                             word, word_bb = self._merge_words(prev_obj, word, word_bb)
+                             unknown_data[key][-1] = (word, word_bb)
+                         else:
+                             unknown_data[key].append((word, word_bb))
+                         break
+                 else:
+                     # Create a new unknown column if no match is found
+                     unknown_key = f'{word}__{index}__'
+                     unknown_columns[unknown_key] = word_bb
+                     unknown_data[unknown_key] = [(word, word_bb)]
+
+         if merge:
+             df.update(unknown_data)
+
+         # Convert lists to DataFrames
+         df = {key: pd.DataFrame(val, columns=['text', 'boundingBox']) for key, val in df.items()}
+         return df, unknown_data, unknown_columns
+
+     def postprocess(self, parsed_df: pd.DataFrame, columns=None):
+         """Post-process the parsed DataFrame to merge columns and clean data."""
+         try:
+             parsed_df = parsed_df.dropna(how='all').reset_index(drop=True)
+             new_df = pd.DataFrame()
+
+             # Merge adjacent empty-header columns into their left neighbour
+             empty_columns = parsed_df.columns[parsed_df.iloc[:1].isna().all()].tolist()
+             for col in empty_columns[::-1]:
+                 col_idx = list(parsed_df.columns).index(col)
+                 if col_idx > 0:
+                     parsed_df.iloc[:, col_idx - 1] += ' ' + parsed_df.iloc[:, col_idx]
+             parsed_df = parsed_df.drop(columns=empty_columns)
+
+             if not columns:
+                 return parsed_df
+
+             used_indices = set()
+             for header in columns:
+                 match_indices = [i for i, col in enumerate(parsed_df.columns) if header in col]
+                 if match_indices:
+                     used_indices.update(match_indices)
+                     new_df[header] = parsed_df.iloc[:, match_indices].apply(
+                         lambda x: ' '.join(x.fillna('').str.strip()), axis=1
+                     )
+
+             # Include unused columns
+             unused_columns = [col for i, col in enumerate(parsed_df.columns) if i not in used_indices]
+             new_df = pd.concat([new_df, parsed_df[unused_columns]], axis=1)
+
+             return new_df
+         except Exception as e:
+             print(f"Error in postprocess: {e}")
+             return parsed_df
+
+     def detect(self, image_path: str):
+         """Detect tables in an image and extract their data."""
+         cords = self._table_detection.detect(image_path)
+         all_table_df = self._document_ocr.recognize(image_path, cords)
+
+         table_data = []
+         for table in all_table_df:
+             column_data, _, _ = self.get_words_in_column({}, table)
+             ordered_columns = sorted(column_data, key=lambda x: column_data[x].iloc[0]['boundingBox'][0])
+             dictword = {col: column_data[col] for col in ordered_columns}
+
+             df = self._linklist.build_structure(dictword)
+             df = df.loc[:, ordered_columns]
+             df = df.rename(columns=lambda col: re.sub(r'__\d+__', '', str(col)).strip())
+             df_postp = self.postprocess(df)
+
+             # Assign generic column names
+             df.columns = [f"column {i+1}" for i in range(df.shape[1])]
+             table_data.append((df, df_postp))
+
+         return table_data[0], cords
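
Putting it together, TableExtraction.detect can be exercised directly; the sketch below mirrors how streamlit_app.py consumes it (image path hypothetical, src/ assumed to be the working directory):

# Sketch: run from inside src/ so `models` and `table_creator` resolve
from table_creator.table_extractor import TableExtraction

extractor = TableExtraction()
(raw_df, cleaned_df), boxes = extractor.detect('sample_page.png')
print(boxes[0])           # largest detected table, [x1, y1, x2, y2]
print(cleaned_df.head())  # post-processed table contents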
src/table_creator/visualization.py ADDED
@@ -0,0 +1,93 @@
+ from typing import List, Tuple, Union
+ import cv2
+ import numpy as np
+ from PIL import Image
+
+ class TableVisualizer:
+     """
+     Utility class for visualizing detected tables and OCR results.
+     """
+
+     @staticmethod
+     def draw_boxes(
+         image: Union[np.ndarray, Image.Image],
+         boxes: List[List[int]],
+         color: Tuple[int, int, int] = (0, 255, 0),
+         thickness: int = 2
+     ) -> Image.Image:
+         """
+         Draw bounding boxes on an image.
+
+         Args:
+             image: Input image
+             boxes: List of bounding box coordinates [x1, y1, x2, y2]
+             color: RGB color for the boxes
+             thickness: Line thickness
+
+         Returns:
+             Image with drawn bounding boxes
+         """
+         if isinstance(image, Image.Image):
+             image = np.array(image)
+
+         if len(image.shape) == 2:
+             image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+         elif image.shape[2] == 4:
+             image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
+
+         image_copy = image.copy()
+
+         for box in boxes:
+             cv2.rectangle(
+                 image_copy,
+                 (box[0], box[1]),
+                 (box[2], box[3]),
+                 color,
+                 thickness
+             )
+
+         return Image.fromarray(image_copy)
+
+     @staticmethod
+     def draw_text_boxes(
+         image: Union[np.ndarray, Image.Image],
+         text_data: List[Tuple[str, List[int]]],
+         color: Tuple[int, int, int] = (255, 0, 0),
+         thickness: int = 1
+     ) -> Image.Image:
+         """
+         Draw text boxes with labels on an image.
+
+         Args:
+             image: Input image
+             text_data: List of (text, bbox) tuples
+             color: RGB color for the boxes
+             thickness: Line thickness
+
+         Returns:
+             Image with drawn text boxes
+         """
+         if isinstance(image, Image.Image):
+             image = np.array(image)
+
+         image_copy = image.copy()
+
+         for text, bbox in text_data:
+             cv2.rectangle(
+                 image_copy,
+                 (bbox[0], bbox[1]),
+                 (bbox[2], bbox[3]),
+                 color,
+                 thickness
+             )
+             cv2.putText(
+                 image_copy,
+                 text[:20],
+                 (bbox[0], bbox[1] - 5),
+                 cv2.FONT_HERSHEY_SIMPLEX,
+                 0.5,
+                 color,
+                 thickness
+             )
+
+         return Image.fromarray(image_copy)
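
TableVisualizer is not wired into the app in this commit; one plausible use is previewing detector output before OCR (path and coordinates below are invented):

# Sketch: overlay a detected table box on the source image
from PIL import Image
from table_creator.visualization import TableVisualizer  # assumes src/ on sys.path

image = Image.open('sample_page.png')
annotated = TableVisualizer.draw_boxes(image, [[50, 80, 600, 400]], color=(0, 255, 0))
annotated.save('sample_page_annotated.png')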