import logging
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd


@dataclass
class TableCell:
    """
    Represents a cell in a table with its value and position.

    Attributes:
        value: The text content of the cell
        bbox: Bounding box coordinates [x1, y1, x2, y2]
        column_name: Name of the column this cell belongs to
    """

    value: str
    bbox: List[int]
    column_name: str


@dataclass
class TableRow:
    """
    Represents a row in a table with its cells and boundaries.

    Attributes:
        cells: Dictionary of column name to TableCell
        min_x: Minimum x coordinate of the row
        max_x: Maximum x coordinate of the row
        min_y: Minimum y coordinate of the row
        max_y: Maximum y coordinate of the row
    """

    cells: Dict[str, "TableCell"]
    min_x: float
    max_x: float
    min_y: float
    max_y: float


class TableStructure:
    """
    Maintains the structure of a table as an ordered list of rows.

    The first column handed to ``build_structure`` seeds one row per cell.
    Every further column is aligned against those rows by vertical overlap;
    a cell that fits no row is inserted before the first row it lies
    entirely above, or appended when it starts at or below the last row.
    """

    # Messages are only emitted when the instance was created with debug=True.
    _logger = logging.getLogger(__name__)

    # Names of the per-row boundary columns added to the output frame.
    _BOUND_COLUMNS = ('row_min_x', 'row_max_x', 'row_min_y', 'row_max_y')

    def __init__(self, debug: bool = False, overlap_threshold: float = 10.0) -> None:
        """
        Initialize the table structure.

        Args:
            debug: Enable debug logging
            overlap_threshold: Overlap percentage a cell must exceed to be
                merged into an existing row
        """
        self.rows: List["TableRow"] = []
        self.debug = debug
        self.overlap_threshold = overlap_threshold

    def build_structure(self, dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        """
        Build table structure from column-wise dataframes.

        Args:
            dataframes: Dictionary of column name to DataFrame containing
                text and positions (read from the ``'text'`` and
                ``'boundingBox'`` columns)

        Returns:
            DataFrame with structured table data: one column per input
            column plus ``row_min_x``, ``row_max_x``, ``row_min_y`` and
            ``row_max_y``
        """
        # Start from a clean slate so repeated calls do not accumulate rows.
        self.rows = []
        if not dataframes:
            return pd.DataFrame()

        column_names = list(dataframes)
        first_col, *other_cols = column_names

        # The first column defines the initial rows; the rest align to them.
        self._initialize_rows(first_col, dataframes[first_col])
        for col_name in other_cols:
            self._process_column(col_name, dataframes[col_name])

        self._log("Built %d rows from %d columns", len(self.rows), len(column_names))
        return self._to_dataframe(column_names)

    def _log(self, message: str, *args: object) -> None:
        """Emit a debug log record, but only when debug mode is enabled."""
        if self.debug:
            self._logger.debug(message, *args)

    @staticmethod
    def _iter_cells(df: pd.DataFrame) -> List[Tuple[str, List[int]]]:
        """Return the ``(text, boundingBox)`` pairs of a column frame in row order."""
        # An empty frame may lack the expected columns entirely, so do not index it.
        if df.empty:
            return []
        return list(zip(df['text'], df['boundingBox']))

    @staticmethod
    def _make_row(column_name: str, text: str, bbox: List[int]) -> "TableRow":
        """Create a row holding a single cell, bounded by that cell's bbox."""
        return TableRow(
            cells={column_name: TableCell(text, bbox, column_name)},
            min_x=bbox[0],
            max_x=bbox[2],
            min_y=bbox[1],
            max_y=bbox[3],
        )

    def _initialize_rows(self, column_name: str, df: pd.DataFrame) -> None:
        """Initialize rows with the first column's data."""
        for text, bbox in self._iter_cells(df):
            self.rows.append(self._make_row(column_name, text, bbox))

    def _process_column(self, column_name: str, df: pd.DataFrame) -> None:
        """Process additional columns and align with existing rows."""
        # Rows before this index were already claimed by an earlier cell of
        # this column and are not considered again.
        search_idx = 0

        for text, bbox in self._iter_cells(df):
            for idx in range(search_idx, len(self.rows)):
                table_row = self.rows[idx]
                # The row is given the cell's own x-range, so only the
                # vertical extents decide the overlap.
                overlap = self._calculate_overlap(
                    bbox,
                    [bbox[0], table_row.min_y, bbox[2], table_row.max_y],
                )

                if overlap > self.overlap_threshold:
                    self._update_row(idx, column_name, text, bbox)
                elif bbox[3] <= table_row.min_y:
                    # Cell ends at or above this row's top: it gets its own row here.
                    self._insert_row(idx, column_name, text, bbox)
                else:
                    # Too little overlap and not entirely above: try the next row.
                    continue

                search_idx = idx + 1
                break
            else:
                # No remaining row accepted the cell.
                if not self.rows or bbox[1] >= self.rows[-1].max_y:
                    self._append_row(column_name, text, bbox)
                else:
                    # The cell starts above the last row's bottom edge yet
                    # matched nothing; it is left out of the table.
                    self._log(
                        "Dropping cell %r of column %r: bbox %s fits no row",
                        text, column_name, bbox,
                    )

    def _calculate_overlap(self, rect1: List[int], rect2: List[int]) -> float:
        """
        Calculate percentage overlap between two rectangles.

        The intersection area is expressed as a percentage of the smaller
        rectangle's area. Disjoint rectangles, or a smaller rectangle with
        no positive area, yield ``0.0``.
        """
        x_left = max(rect1[0], rect2[0])
        y_top = max(rect1[1], rect2[1])
        x_right = min(rect1[2], rect2[2])
        y_bottom = min(rect1[3], rect2[3])

        if x_right < x_left or y_bottom < y_top:
            return 0.0

        min_area = min(
            (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]),
            (rect2[2] - rect2[0]) * (rect2[3] - rect2[1]),
        )
        if min_area <= 0:
            return 0.0

        intersection = (x_right - x_left) * (y_bottom - y_top)
        return intersection / min_area * 100

    def _update_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """Update existing row with new cell data."""
        target = self.rows[idx]
        target.cells[column_name] = TableCell(text, bbox, column_name)
        # NOTE(review): only the horizontal bounds are widened; min_y/max_y
        # keep the values from the cell that created the row - confirm intended.
        target.min_x = min(target.min_x, bbox[0])
        target.max_x = max(target.max_x, bbox[2])

    def _insert_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """Insert new row at specified index."""
        self.rows.insert(idx, self._make_row(column_name, text, bbox))

    def _append_row(self, column_name: str, text: str, bbox: List[int]) -> None:
        """Append new row at the end."""
        self.rows.append(self._make_row(column_name, text, bbox))

    def _to_dataframe(self, columns: List[str]) -> pd.DataFrame:
        """
        Convert table structure to DataFrame.

        Each row contributes the text of its cell per requested column
        (``None`` where the row has no cell for that column) followed by
        the row's boundary values.
        """
        columns = list(columns)

        if not self.rows:
            # Keep the schema stable even when there is nothing to report.
            header = list(dict.fromkeys([*columns, *self._BOUND_COLUMNS]))
            return pd.DataFrame(columns=header)

        data = []
        for row in self.rows:
            row_data = {}
            for col in columns:
                cell = row.cells.get(col)
                row_data[col] = cell.value if cell is not None else None
            row_data.update({
                'row_min_x': row.min_x,
                'row_max_x': row.max_x,
                'row_min_y': row.min_y,
                'row_max_y': row.max_y,
            })
            data.append(row_data)

        return pd.DataFrame(data)