table-extraction / src /table_creator /data_structures.py
Sudhanshu Pandey
Update the files
a7b8c18
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import pandas as pd
import numpy as np
@dataclass
class TableCell:
    """
    A single table cell: its text together with where it was found.

    Attributes:
        value: Text content of the cell.
        bbox: Bounding box given as [x1, y1, x2, y2].
        column_name: Name of the column the cell belongs to.
    """

    value: str
    bbox: List[int]
    column_name: str
@dataclass
class TableRow:
    """
    One table row: its cells keyed by column, plus the row's extent.

    Attributes:
        cells: Mapping of column name to the TableCell held for that column.
        min_x: Smallest x coordinate recorded for the row.
        max_x: Largest x coordinate recorded for the row.
        min_y: Smallest y coordinate recorded for the row.
        max_y: Largest y coordinate recorded for the row.
    """

    cells: Dict[str, TableCell]
    min_x: float
    max_x: float
    min_y: float
    max_y: float
class TableStructure:
    """
    Aligns per-column text boxes into table rows.

    Rows are held in ``self.rows``, a plain Python list (not a linked list).
    The first column seeds one row per entry; every further column is merged
    in by comparing each cell's y-range with the y-range of existing rows.

    NOTE(review): the forward-only search in ``_process_column`` appears to
    assume that each column's entries arrive ordered by increasing y —
    confirm against the caller.
    """

    # A cell joins an existing row only when their overlap (in percent, see
    # ``_calculate_overlap``) is strictly greater than this value.
    _MIN_OVERLAP_PCT = 10

    # Names of the row-extent columns added to every output record.
    _EXTENT_KEYS = ('row_min_x', 'row_max_x', 'row_min_y', 'row_max_y')

    def __init__(self, debug: bool = False) -> None:
        """
        Initialize the table structure.

        Args:
            debug: Enable debug logging (currently: report cells that could
                not be placed in any row).
        """
        self.rows: List[TableRow] = []
        self.debug = debug

    def build_structure(self, dataframes: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        """
        Build table structure from column-wise dataframes.

        Args:
            dataframes: Dictionary of column name to DataFrame. Each frame is
                read through its ``'text'`` and ``'boundingBox'`` columns.

        Returns:
            DataFrame with one record per row: one column per input column
            (``None`` where a row has no cell for it) followed by
            ``row_min_x``, ``row_max_x``, ``row_min_y`` and ``row_max_y``.
        """
        # Start from a clean slate so that calling this method again does not
        # stack the new rows on top of the previous call's rows.
        self.rows = []
        if not dataframes:
            return pd.DataFrame()

        column_names = list(dataframes)
        first_col, *other_cols = column_names

        # The first column defines the initial set of rows ...
        self._initialize_rows(first_col, dataframes[first_col])
        # ... and every other column is aligned against them.
        for col_name in other_cols:
            self._process_column(col_name, dataframes[col_name])

        return self._to_dataframe(column_names)

    def _debug(self, message: str, *args) -> None:
        """Emit a debug log record when debug mode is enabled."""
        if self.debug:
            # Local import: the module header does not import logging.
            import logging
            logging.getLogger(__name__).debug(message, *args)

    @staticmethod
    def _new_row(column_name: str, text: str, bbox: List[int]) -> "TableRow":
        """Create a row holding a single cell, with the cell's bbox as extent."""
        return TableRow(
            cells={column_name: TableCell(text, bbox, column_name)},
            min_x=bbox[0],
            max_x=bbox[2],
            min_y=bbox[1],
            max_y=bbox[3],
        )

    def _initialize_rows(self, column_name: str, df: pd.DataFrame) -> None:
        """Create one row per entry of the first column, in DataFrame order."""
        for _, row in df.iterrows():
            self.rows.append(
                self._new_row(column_name, row['text'], row['boundingBox'])
            )

    def _process_column(self, column_name: str, df: pd.DataFrame) -> None:
        """
        Merge one additional column into the existing rows.

        For each cell, rows are scanned starting just after the row used by
        the previous cell of this column:

        * overlap with a row above the threshold -> the cell is stored in
          that row;
        * the cell's y2 is at or before the row's ``min_y`` -> a new row is
          inserted in front of that row;
        * nothing matched and the cell's y1 is at or past the last row's
          ``max_y`` (or there are no rows at all) -> a new row is appended.

        A cell that satisfies none of these is not placed anywhere; this is
        reported through the debug log when ``debug`` is enabled.
        """
        search_idx = 0
        for _, row in df.iterrows():
            text = row['text']
            bbox = row['boundingBox']
            matched = False

            for idx in range(search_idx, len(self.rows)):
                table_row = self.rows[idx]
                # The comparison rectangle reuses the cell's own x-range, so
                # only the y-ranges of cell and row decide the overlap.
                overlap = self._calculate_overlap(
                    bbox,
                    [bbox[0], table_row.min_y, bbox[2], table_row.max_y]
                )
                if overlap > self._MIN_OVERLAP_PCT:
                    self._update_row(idx, column_name, text, bbox)
                    search_idx = idx + 1
                    matched = True
                    break
                if bbox[3] <= table_row.min_y:
                    self._insert_row(idx, column_name, text, bbox)
                    search_idx = idx + 1
                    matched = True
                    break

            if matched:
                continue

            # ``not self.rows`` guards the rows[-1] lookup: with an empty
            # first column there is no last row to compare against.
            if not self.rows or bbox[1] >= self.rows[-1].max_y:
                self._append_row(column_name, text, bbox)
            else:
                self._debug(
                    "Column %r: cell %r with bbox %r matched no row and was skipped",
                    column_name, text, bbox,
                )

    def _calculate_overlap(self, rect1: List[int], rect2: List[int]) -> float:
        """
        Calculate percentage overlap between two rectangles.

        Args:
            rect1: First rectangle as [x1, y1, x2, y2].
            rect2: Second rectangle as [x1, y1, x2, y2].

        Returns:
            Intersection area as a percentage of the smaller rectangle's
            area; ``0.0`` when the rectangles are disjoint or the smaller
            area is not positive.
        """
        x_left = max(rect1[0], rect2[0])
        y_top = max(rect1[1], rect2[1])
        x_right = min(rect1[2], rect2[2])
        y_bottom = min(rect1[3], rect2[3])
        if x_right < x_left or y_bottom < y_top:
            return 0.0

        intersection = (x_right - x_left) * (y_bottom - y_top)
        min_area = min(
            (rect1[2] - rect1[0]) * (rect1[3] - rect1[1]),
            (rect2[2] - rect2[0]) * (rect2[3] - rect2[1])
        )
        # Degenerate (zero-area) rectangle: avoid dividing by zero and keep
        # the return type a float, as annotated.
        if min_area <= 0:
            return 0.0
        return intersection / min_area * 100

    def _update_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """
        Store a cell in the existing row at ``idx``.

        Only the row's x-extent is widened; ``min_y``/``max_y`` keep the
        values the row was created with.
        """
        target = self.rows[idx]
        target.cells[column_name] = TableCell(text, bbox, column_name)
        target.min_x = min(target.min_x, bbox[0])
        target.max_x = max(target.max_x, bbox[2])

    def _insert_row(self, idx: int, column_name: str, text: str, bbox: List[int]) -> None:
        """Insert a new single-cell row at position ``idx``."""
        self.rows.insert(idx, self._new_row(column_name, text, bbox))

    def _append_row(self, column_name: str, text: str, bbox: List[int]) -> None:
        """Append a new single-cell row at the end."""
        self.rows.append(self._new_row(column_name, text, bbox))

    def _to_dataframe(self, columns: List[str]) -> pd.DataFrame:
        """
        Convert table structure to DataFrame.

        Args:
            columns: Column names to emit, in output order. Any iterable of
                names is accepted.

        Returns:
            One record per row; columns without a cell in a row hold
            ``None``. With no rows, an empty DataFrame that still carries the
            expected column labels.
        """
        # Materialize once: callers may pass dict keys or a one-shot iterator,
        # and the names are walked once per row.
        columns = list(columns)

        data = []
        for row in self.rows:
            row_data = {
                col: row.cells[col].value if col in row.cells else None
                for col in columns
            }
            row_data.update({
                'row_min_x': row.min_x,
                'row_max_x': row.max_x,
                'row_min_y': row.min_y,
                'row_max_y': row.max_y
            })
            data.append(row_data)

        if not data:
            # Keep the schema stable for downstream code even without rows.
            return pd.DataFrame(columns=columns + list(self._EXTENT_KEYS))
        return pd.DataFrame(data)