File size: 6,069 Bytes
a7b8c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import pandas as pd
import numpy as np

@dataclass
class TableCell:
    """
    One cell of a table: its text together with its position.

    Attributes:
        value: Text held by the cell
        bbox: Bounding box of the cell, given as [x1, y1, x2, y2]
        column_name: Name of the column the cell is filed under
    """
    value: str
    bbox: List[int]
    column_name: str

@dataclass
class TableRow:
    """
    One row of a table: the cells it holds plus the row's bounding extents.

    Attributes:
        cells: Mapping from column name to the TableCell stored for that column
        min_x: Smallest x coordinate covered by the row
        max_x: Largest x coordinate covered by the row
        min_y: Smallest y coordinate covered by the row
        max_y: Largest y coordinate covered by the row
    """
    cells: Dict[str, TableCell]
    min_x: float
    max_x: float
    min_y: float
    max_y: float

class TableStructure:
    """
    Aligns per-column text cells into table rows.

    Rows are kept in ``self.rows``, a plain Python list. The first column
    seeds one row per cell; cells from every later column are merged into
    the row they overlap vertically, or placed in a new row when they sit
    entirely before or after the existing rows.
    """

    def __init__(self, debug: bool = False, overlap_threshold: float = 10.0) -> None:
        """
        Initialize the table structure.

        Args:
            debug: Enable debug logging (stored on the instance; not read
                by any method of this class)
            overlap_threshold: Overlap percentage, as computed by
                ``_calculate_overlap``, that a cell must exceed to be
                merged into an existing row
        """
        self.rows: List[TableRow] = []
        self.debug = debug
        self.overlap_threshold = overlap_threshold

    def build_structure(self, dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        """
        Build table structure from column-wise dataframes.

        Args:
            dataframes: Dictionary of column name to DataFrame. Each
                DataFrame is read through its 'text' and 'boundingBox'
                columns; the first dictionary entry seeds the rows.

        Returns:
            DataFrame with one line per table row: one field per column
            name plus 'row_min_x', 'row_max_x', 'row_min_y', 'row_max_y'.
            An empty DataFrame when ``dataframes`` is empty.
        """
        # Start from a fresh list so that reusing the instance does not
        # append this table's rows to those of a previous call.
        self.rows = []

        if not dataframes:
            return pd.DataFrame()

        column_names = list(dataframes)
        first_col, *other_cols = column_names

        # The first column defines the initial set of rows
        self._initialize_rows(first_col, dataframes[first_col])

        # Every other column is aligned against the rows built so far
        for col_name in other_cols:
            self._process_column(col_name, dataframes[col_name])

        return self._to_dataframe(column_names)

    def _initialize_rows(self, column_name: str, df: pd.DataFrame) -> None:
        """Create one row per cell of the first column, in DataFrame order."""
        for _, row in df.iterrows():
            bbox = row['boundingBox']
            self.rows.append(TableRow(
                cells={column_name: TableCell(row['text'], bbox, column_name)},
                min_x=bbox[0],
                max_x=bbox[2],
                min_y=bbox[1],
                max_y=bbox[3]
            ))

    def _process_column(self, column_name: str, df: pd.DataFrame) -> None:
        """
        Align the cells of an additional column with the existing rows.

        For each cell, rows are scanned starting at the position just after
        the previous match. The cell is merged into the first row whose
        vertical band overlaps it by more than ``overlap_threshold``
        percent, inserted as a new row in front of the first row that
        starts at or after the cell's bottom edge, or appended when it
        starts at or after the bottom edge of the last row.

        NOTE(review): the forward-only search index presumes that both the
        cells in ``df`` and the existing rows are ordered by increasing y;
        confirm against the caller.
        """
        search_idx = 0

        for _, row in df.iterrows():
            text = row['text']
            bbox = row['boundingBox']

            matched = False
            for idx in range(search_idx, len(self.rows)):
                table_row = self.rows[idx]
                # Compare against the row's vertical band only: the probe
                # rectangle reuses the cell's own x extent.
                overlap = self._calculate_overlap(
                    bbox,
                    [bbox[0], table_row.min_y, bbox[2], table_row.max_y]
                )

                if overlap > self.overlap_threshold:
                    self._update_row(idx, column_name, text, bbox)
                    search_idx = idx + 1
                    matched = True
                    break
                elif bbox[3] <= table_row.min_y:
                    # Cell ends before this row begins: it needs its own row
                    self._insert_row(idx, column_name, text, bbox)
                    search_idx = idx + 1
                    matched = True
                    break

            # With no rows at all (e.g. an empty first column) there is no
            # last row to compare against, so the cell starts the table.
            # A cell that neither matched nor lies at/after the last row's
            # bottom edge is not stored anywhere.
            if not matched and (not self.rows or bbox[1] >= self.rows[-1].max_y):
                self._append_row(column_name, text, bbox)

    def _calculate_overlap(self, rect1: List[int], rect2: List[int]) -> float:
        """
        Calculate percentage overlap between two rectangles.

        Args:
            rect1: First rectangle as [x1, y1, x2, y2]
            rect2: Second rectangle as [x1, y1, x2, y2]

        Returns:
            Intersection area as a percentage of the smaller rectangle's
            area; 0.0 when the rectangles do not intersect or the smaller
            area is not positive.
        """
        x_left = max(rect1[0], rect2[0])
        y_top = max(rect1[1], rect2[1])
        x_right = min(rect1[2], rect2[2])
        y_bottom = min(rect1[3], rect2[3])

        if x_right < x_left or y_bottom < y_top:
            return 0.0

        intersection = (x_right - x_left) * (y_bottom - y_top)
        min_area = min(
            (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]),
            (rect2[2] - rect2[0]) * (rect2[3] - rect2[1])
        )

        # Guard against division by zero for degenerate rectangles
        if min_area <= 0:
            return 0.0
        return intersection / min_area * 100

    def _update_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """Store a cell in the row at ``idx`` and widen the row's x extent to cover it."""
        target = self.rows[idx]
        target.cells[column_name] = TableCell(text, bbox, column_name)
        # Only the horizontal extent grows; min_y/max_y are left unchanged.
        target.min_x = min(target.min_x, bbox[0])
        target.max_x = max(target.max_x, bbox[2])

    def _insert_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """Insert a new single-cell row at ``idx``, bounded by the cell's bbox."""
        self.rows.insert(idx, TableRow(
            cells={column_name: TableCell(text, bbox, column_name)},
            min_x=bbox[0],
            max_x=bbox[2],
            min_y=bbox[1],
            max_y=bbox[3]
        ))

    def _append_row(self, column_name: str, text: str, bbox: List[int]) -> None:
        """Append a new single-cell row at the end, bounded by the cell's bbox."""
        self.rows.append(TableRow(
            cells={column_name: TableCell(text, bbox, column_name)},
            min_x=bbox[0],
            max_x=bbox[2],
            min_y=bbox[1],
            max_y=bbox[3]
        ))

    def _to_dataframe(self, columns: List[str]) -> pd.DataFrame:
        """
        Convert table structure to DataFrame.

        Args:
            columns: Column names to emit, in output order

        Returns:
            DataFrame with one line per row: the cell text for each column
            (None where the row has no cell for it) followed by the row's
            'row_min_x', 'row_max_x', 'row_min_y', 'row_max_y'. When there
            are no rows, an empty DataFrame carrying those same columns.
        """
        columns = list(columns)
        data = []
        for row in self.rows:
            row_data = {
                col: row.cells[col].value if col in row.cells else None
                for col in columns
            }
            row_data.update({
                'row_min_x': row.min_x,
                'row_max_x': row.max_x,
                'row_min_y': row.min_y,
                'row_max_y': row.max_y
            })
            data.append(row_data)

        if not data:
            # Keep the column layout stable even when no rows were built
            bound_columns = ['row_min_x', 'row_max_x', 'row_min_y', 'row_max_y']
            return pd.DataFrame(columns=columns + bound_columns)
        return pd.DataFrame(data)