# File size: 6,069 Bytes
# a7b8c18
# (line-number gutter "1 ... 177" left over from the file viewer removed)
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import pandas as pd
import numpy as np
@dataclass
class TableCell:
    """
    One table cell: its text together with its bounding box.

    Attributes:
        value: Text content of the cell.
        bbox: Bounding box given as [x1, y1, x2, y2].
        column_name: Name of the column the cell belongs to.
    """

    value: str
    bbox: List[int]
    column_name: str
@dataclass
class TableRow:
    """
    One table row: the cells collected so far plus the row's extent.

    Attributes:
        cells: Mapping from column name to the TableCell in that column.
        min_x: Smallest x coordinate of the row.
        max_x: Largest x coordinate of the row.
        min_y: Smallest y coordinate of the row.
        max_y: Largest y coordinate of the row.
    """

    cells: Dict[str, TableCell]
    min_x: float
    max_x: float
    min_y: float
    max_y: float
class TableStructure:
    """
    Aligns per-column text cells into table rows.

    Rows are kept in ``self.rows``, a plain Python list of ``TableRow``
    objects. The first column seeds one row per cell; every later column is
    merged in by comparing each cell's vertical extent with the existing rows.

    Attributes:
        rows: Rows produced by the most recent ``build_structure`` call.
        debug: Flag stored from the constructor; no method of this class
            reads it.
        min_overlap: Percentage overlap a cell must exceed to join a row.
    """

    # Overlap percentage used when the caller does not supply one.
    DEFAULT_MIN_OVERLAP = 10.0

    # Row-boundary columns appended after the data columns in the output.
    _BOUNDARY_COLUMNS = ('row_min_x', 'row_max_x', 'row_min_y', 'row_max_y')

    def __init__(self, debug: bool = False,
                 min_overlap: float = DEFAULT_MIN_OVERLAP) -> None:
        """
        Initialize the table structure.

        Args:
            debug: Debug flag; stored as ``self.debug``.
            min_overlap: Overlap percentage (0-100) that a cell must exceed
                to be placed into an existing row. Defaults to 10.
        """
        self.rows: List[TableRow] = []
        self.debug = debug
        self.min_overlap = min_overlap

    def build_structure(self, dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        """
        Build table structure from column-wise dataframes.

        Args:
            dataframes: Dictionary of column name to DataFrame. Each frame is
                read through its ``'text'`` and ``'boundingBox'`` columns.

        Returns:
            DataFrame with one line per table row: one column per input
            column (``None`` where a row has no cell) followed by
            ``row_min_x``, ``row_max_x``, ``row_min_y`` and ``row_max_y``.
            An empty ``dataframes`` mapping yields an empty DataFrame.
        """
        # Drop rows left over from an earlier call; otherwise a second table
        # would be aligned against (and appended to) the previous one.
        self.rows = []

        if not dataframes:
            return pd.DataFrame()

        column_names = list(dataframes)
        first_col, *other_cols = column_names

        # The first column defines the initial set of rows.
        self._initialize_rows(first_col, dataframes[first_col])

        # Every other column is aligned against the rows built so far.
        for col_name in other_cols:
            self._process_column(col_name, dataframes[col_name])

        return self._to_dataframe(column_names)

    @staticmethod
    def _make_row(column_name: str, text: str, bbox: List[int]) -> "TableRow":
        """Create a row holding a single cell, bounded by that cell's bbox."""
        return TableRow(
            cells={column_name: TableCell(text, bbox, column_name)},
            min_x=bbox[0],
            max_x=bbox[2],
            min_y=bbox[1],
            max_y=bbox[3],
        )

    def _initialize_rows(self, column_name: str, df: pd.DataFrame) -> None:
        """Append one new row per record of the first column's DataFrame."""
        for _, row in df.iterrows():
            self.rows.append(
                self._make_row(column_name, row['text'], row['boundingBox'])
            )

    def _process_column(self, column_name: str, df: pd.DataFrame) -> None:
        """
        Align the cells of an additional column with the existing rows.

        For each cell, rows are scanned starting just after the row used by
        the previous cell of this column:

        * overlap with a row above ``self.min_overlap`` percent -> the cell
          is stored in that row;
        * otherwise, cell's y2 <= row's ``min_y`` -> a new row is inserted
          in front of that row;
        * no row qualified and the cell starts at or after the last row's
          ``max_y`` (or there are no rows at all) -> a new row is appended.

        A cell that satisfies none of these is not stored anywhere.
        """
        search_idx = 0
        for _, row in df.iterrows():
            text = row['text']
            bbox = row['boundingBox']

            for idx in range(search_idx, len(self.rows)):
                table_row = self.rows[idx]
                # Compare against a rectangle with the cell's own x-range so
                # that only the vertical extent influences the percentage.
                overlap = self._calculate_overlap(
                    bbox,
                    [bbox[0], table_row.min_y, bbox[2], table_row.max_y]
                )
                if overlap > self.min_overlap:
                    self._update_row(idx, column_name, text, bbox)
                elif bbox[3] <= table_row.min_y:
                    self._insert_row(idx, column_name, text, bbox)
                else:
                    continue
                # Resume the next cell's scan after the row just used.
                search_idx = idx + 1
                break
            else:
                # No row matched. Guard the empty case explicitly: indexing
                # self.rows[-1] on an empty list would raise IndexError.
                if not self.rows or bbox[1] >= self.rows[-1].max_y:
                    self._append_row(column_name, text, bbox)

    def _calculate_overlap(self, rect1: List[int], rect2: List[int]) -> float:
        """
        Calculate percentage overlap between two rectangles.

        Args:
            rect1: First rectangle as [x1, y1, x2, y2].
            rect2: Second rectangle as [x1, y1, x2, y2].

        Returns:
            Intersection area divided by the smaller rectangle's area, times
            100. Returns 0.0 when the rectangles do not intersect or the
            smaller area is not positive.
        """
        x_left = max(rect1[0], rect2[0])
        y_top = max(rect1[1], rect2[1])
        x_right = min(rect1[2], rect2[2])
        y_bottom = min(rect1[3], rect2[3])

        if x_right < x_left or y_bottom < y_top:
            return 0.0

        min_area = min(
            (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]),
            (rect2[2] - rect2[0]) * (rect2[3] - rect2[1])
        )
        if min_area <= 0:
            # Degenerate rectangle: avoid dividing by zero, keep a float result.
            return 0.0

        intersection = (x_right - x_left) * (y_bottom - y_top)
        return intersection / min_area * 100.0

    def _update_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """
        Store a cell in the existing row at ``idx``.

        Only the row's x-bounds are widened to cover the new cell; ``min_y``
        and ``max_y`` are left as they were.
        """
        target = self.rows[idx]
        target.cells[column_name] = TableCell(text, bbox, column_name)
        target.min_x = min(target.min_x, bbox[0])
        target.max_x = max(target.max_x, bbox[2])

    def _insert_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """Insert a new single-cell row at position ``idx``."""
        self.rows.insert(idx, self._make_row(column_name, text, bbox))

    def _append_row(self, column_name: str, text: str, bbox: List[int]) -> None:
        """Append a new single-cell row at the end."""
        self.rows.append(self._make_row(column_name, text, bbox))

    def _to_dataframe(self, columns: List[str]) -> pd.DataFrame:
        """
        Convert table structure to DataFrame.

        Args:
            columns: Column names to emit, in output order. Any iterable of
                names is accepted.

        Returns:
            DataFrame with the requested columns followed by the row-boundary
            columns. The column set is the same whether or not rows exist.
        """
        column_names = list(columns)

        data = []
        for row in self.rows:
            row_data = {}
            for col in column_names:
                cell = row.cells.get(col)
                row_data[col] = cell.value if cell is not None else None
            row_data.update({
                'row_min_x': row.min_x,
                'row_max_x': row.max_x,
                'row_min_y': row.min_y,
                'row_max_y': row.max_y
            })
            data.append(row_data)

        # dict.fromkeys de-duplicates while keeping first-seen order, which
        # matches the key order of the per-row dictionaries built above.
        output_columns = list(dict.fromkeys([*column_names, *self._BOUNDARY_COLUMNS]))
        return pd.DataFrame(data, columns=output_columns)