table-extraction / src /table_creator /table_extractor.py

Sudhanshu Pandey

adding to spaces

2e79922 about 2 months ago

6.71 kB

	from src.models.table_detector import TableDetector
	from src.models.text_recognizer import TextRecognizer
	from src.table_creator.data_structures import TableStructure
	import pandas as pd
	import re

	class TableExtraction:
	    """Locate tables in a page image and rebuild them as pandas DataFrames.

	    The pipeline wired up in :meth:`detect` is: table detection -> OCR of
	    each detected table -> grouping of words into columns ->
	    ``TableStructure.build_structure`` -> :meth:`postprocess`.

	    Bounding boxes are 4-element sequences indexed ``[0..3]``. The code
	    combines them as if they were ``[x_min, y_min, x_max, y_max]``
	    (assumption -- confirm against ``TableDetector`` / ``TextRecognizer``).
	    """

	    # Thresholds compared against ``TableDetector._calculate_overlap``.
	    # The scale of that score is not visible in this file; the values look
	    # like percentages -- TODO confirm.
	    _KNOWN_COLUMN_OVERLAP = 10    # word vs. caller-supplied column (strict >)
	    _UNKNOWN_COLUMN_OVERLAP = 30  # word vs. auto-discovered column (strict >)
	    _MERGE_OVERLAP = 30           # word vs. previous entry of a column (>=)

	    def __init__(self) -> None:
	        self._table_detection = TableDetector()
	        self._document_ocr = TextRecognizer()
	        self._linklist = TableStructure()

	    def _merge_words(self, prev_obj, word, word_bb):
	        """Join ``word`` onto a previous ``(text, bbox)`` entry.

	        Args:
	            prev_obj: ``(text, bbox)`` tuple already stored in a column.
	            word: Text of the word being merged in.
	            word_bb: Bounding box of ``word``.

	        Returns:
	            ``(merged_text, merged_bb)`` where the texts are joined with a
	            single space and the box is the union of both boxes, so the
	            merged box always encloses both words.
	        """
	        prev_text, prev_bb = prev_obj[0], prev_obj[1]
	        merged_text = prev_text + ' ' + word
	        merged_bb = [
	            min(prev_bb[0], word_bb[0]),
	            min(prev_bb[1], word_bb[1]),
	            max(prev_bb[2], word_bb[2]),
	            max(prev_bb[3], word_bb[3]),
	        ]
	        return (merged_text, merged_bb)

	    def _append_or_merge(self, bucket, word, word_bb):
	        """Store a word in ``bucket``, merging it into the last entry if they overlap.

	        The last entry's box is compared with a copy of itself whose
	        components 1 and 3 are replaced by the new word's, i.e. only the
	        extent along that axis is compared.

	        Args:
	            bucket: List of ``(text, bbox)`` tuples; modified in place.
	            word: Text of the new word.
	            word_bb: Bounding box of the new word.

	        Returns:
	            The ``(text, bbox)`` that ended up as the last entry of ``bucket``.
	        """
	        if bucket:
	            prev_obj = bucket[-1]
	            prev_bb = prev_obj[1]
	            prev_overlap = self._table_detection._calculate_overlap(
	                prev_bb, [prev_bb[0], word_bb[1], prev_bb[2], word_bb[3]]
	            )
	            if prev_overlap >= self._MERGE_OVERLAP:
	                word, word_bb = self._merge_words(prev_obj, word, word_bb)
	                bucket[-1] = (word, word_bb)
	                return word, word_bb
	        bucket.append((word, word_bb))
	        return word, word_bb

	    def _assign_to_column(self, word, word_bb, columns, df, debug=False):
	        """Assign a word to the first known column it overlaps.

	        Args:
	            word: Text of the word.
	            word_bb: Bounding box of the word.
	            columns: Mapping ``column key -> bbox``. The matched column's box
	                is widened in place so that it also covers the word.
	            df: Mapping ``column key -> list of (text, bbox)``; modified in
	                place.
	            debug: Accepted for signature compatibility; currently unused.

	        Returns:
	            ``True`` if the word was stored in a column, ``False`` otherwise.
	        """
	        for key, col_bb in columns.items():
	            # Give the word the column's components 1 and 3 so that only the
	            # extent along components 0 and 2 influences the overlap score.
	            word_bb_temp = [word_bb[0], col_bb[1], word_bb[2], col_bb[3]]
	            overlap = self._table_detection._calculate_overlap(word_bb_temp, col_bb)

	            if overlap > self._KNOWN_COLUMN_OVERLAP:
	                word, word_bb = self._append_or_merge(df[key], word, word_bb)
	                # Dynamically adjust the column bounding box to fit the new
	                # (possibly merged) word. Re-assigning an existing key is safe
	                # during iteration, and we return immediately afterwards.
	                columns[key] = [
	                    min(word_bb[0], col_bb[0]), col_bb[1],
	                    max(word_bb[2], col_bb[2]), col_bb[3],
	                ]
	                return True
	        return False

	    def _get_normalized_bounding_box(self, imgsz: str, bb: list) -> pd.DataFrame:
	        """Unimplemented placeholder.

	        Despite the return annotation, this currently does nothing and
	        returns ``None``.
	        """
	        # TODO: implement, or remove once confirmed unused elsewhere.
	        return None

	    def get_words_in_column(self, cords: dict, df_word: pd.DataFrame, merge=True, debug=False):
	        """Distribute words into columns based on bounding-box overlap.

	        Each row of ``df_word`` is first offered to the known columns in
	        ``cords``. Words that match none of them are grouped into "unknown"
	        columns: a word joins the first unknown column it overlaps, otherwise
	        it starts a new unknown column keyed ``"<word>__<row index>__"``.

	        Args:
	            cords: Mapping ``column key -> bbox`` of known columns. Boxes of
	                matched columns are widened in place.
	            df_word: DataFrame with a ``'text'`` column and a
	                ``'boundingBox'`` column whose values are 4-element sequences
	                convertible to ``int``.
	            merge: When true, the unknown columns are included in the first
	                returned mapping.
	            debug: When true, print each word as it is processed.

	        Returns:
	            A 3-tuple ``(frames, unknown_data, unknown_columns)``:
	            ``frames`` maps column key to a DataFrame with columns
	            ``['text', 'boundingBox']``; ``unknown_data`` maps unknown column
	            key to its list of ``(text, bbox)``; ``unknown_columns`` maps
	            unknown column key to the bbox of the word that created it.
	        """
	        df = {key: [] for key in cords}
	        unknown_columns = {}
	        unknown_data = {}

	        for index, row in df_word.iterrows():
	            word, word_bb = row['text'], list(map(int, row['boundingBox']))
	            if debug:
	                print(f"\nProcessing word: '{word}'")

	            if self._assign_to_column(word, word_bb, cords, df, debug):
	                continue

	            # Handle words that do not match any known column.
	            for key, val in unknown_columns.items():
	                overlap = self._table_detection._calculate_overlap(
	                    val, [word_bb[0], val[1], word_bb[2], val[3]]
	                )
	                if overlap > self._UNKNOWN_COLUMN_OVERLAP:
	                    self._append_or_merge(unknown_data[key], word, word_bb)
	                    break
	            else:
	                # No unknown column matched (the loop ended without ``break``):
	                # start a new one. Doing this in the ``for ... else`` branch
	                # keeps the dict from being resized while it is iterated.
	                unknown_key = f'{word}__{index}__'
	                unknown_columns[unknown_key] = word_bb
	                unknown_data[unknown_key] = [(word, word_bb)]

	        if merge:
	            df.update(unknown_data)

	        # Convert lists to DataFrames
	        frames = {
	            key: pd.DataFrame(val, columns=['text', 'boundingBox'])
	            for key, val in df.items()
	        }
	        return frames, unknown_data, unknown_columns

	    @staticmethod
	    def _join_text_columns(left: pd.Series, right: pd.Series) -> pd.Series:
	        """Concatenate two columns cell by cell, treating missing cells as empty.

	        A plain ``left + ' ' + right`` turns a cell into NaN as soon as either
	        side is NaN, which would wipe out real text. Here a cell stays missing
	        only when both inputs are missing. The result keeps ``left``'s name.
	        """
	        both_missing = left.isna() & right.isna()
	        joined = left.fillna('').astype(str) + ' ' + right.fillna('').astype(str)
	        return joined.str.strip().mask(both_missing).rename(left.name)

	    def postprocess(self, parsed_df: pd.DataFrame, columns=None):
	        """Merge header-less columns into their left neighbour and group by header.

	        Steps:
	            1. Drop rows that are entirely NaN and reset the index.
	            2. Every column whose first row is NaN is merged, right to left,
	               into the column on its left and then removed.
	            3. If ``columns`` is given, for each header all columns whose label
	               contains that header are joined (space separated) into one
	               output column; columns matching no header are appended
	               unchanged.

	        Args:
	            parsed_df: Table to clean up.
	            columns: Optional iterable of header strings to group by.

	        Returns:
	            The cleaned DataFrame. On any exception the error is printed and
	            the DataFrame is returned in whatever state it had reached
	            (best effort; this method does not raise).
	        """
	        try:
	            parsed_df = parsed_df.dropna(how='all').reset_index(drop=True)

	            # Work by position rather than by label: labels may be duplicated
	            # (e.g. after callers strip suffixes from the column keys).
	            n_cols = parsed_df.shape[1]
	            header_missing = parsed_df.iloc[:1].isna().all().to_numpy()
	            merged = [parsed_df.iloc[:, pos] for pos in range(n_cols)]
	            for pos in range(n_cols - 1, 0, -1):
	                if header_missing[pos]:
	                    merged[pos - 1] = self._join_text_columns(merged[pos - 1], merged[pos])
	            # NOTE(review): a header-less column at position 0 has no left
	            # neighbour and is dropped together with its data, as before.
	            kept = [merged[pos] for pos in range(n_cols) if not header_missing[pos]]
	            parsed_df = pd.concat(kept, axis=1) if kept else parsed_df.iloc[:, :0]

	            if not columns:
	                return parsed_df

	            new_df = pd.DataFrame()
	            used_positions = set()
	            for header in columns:
	                hits = [
	                    pos for pos, col in enumerate(parsed_df.columns)
	                    if header in str(col)
	                ]
	                if hits:
	                    used_positions.update(hits)
	                    new_df[header] = parsed_df.iloc[:, hits].apply(
	                        lambda cells: ' '.join(cells.fillna('').astype(str).str.strip()),
	                        axis=1,
	                    )

	            # Include unused columns
	            leftover = [pos for pos in range(parsed_df.shape[1]) if pos not in used_positions]
	            return pd.concat([new_df, parsed_df.iloc[:, leftover]], axis=1)
	        except Exception as e:
	            # Deliberate best-effort: report and hand back what we have.
	            print(f"Error in postprocess: {e}")
	            return parsed_df

	    def detect(self, image_path: str):
	        """Detect tables in an image and extract the first table's data.

	        Args:
	            image_path: Path of the image, passed to the detector and the OCR
	                component.

	        Returns:
	            ``((raw_df, postprocessed_df), cords)`` for the first table.
	            ``raw_df`` has generic ``"column N"`` labels, ``postprocessed_df``
	            is the result of :meth:`postprocess`, and ``cords`` is whatever
	            ``TableDetector.detect`` returned. When no table is produced, both
	            DataFrames are empty instead of an ``IndexError`` being raised.
	        """
	        cords = self._table_detection.detect(image_path)
	        all_table_df = self._document_ocr.recognize(image_path, cords)

	        table_data = []
	        for table in all_table_df:
	            # No known columns are supplied, so every column is discovered.
	            column_data, _, _ = self.get_words_in_column({}, table)
	            # Order columns by component 0 of their first word's box.
	            ordered_columns = sorted(
	                column_data,
	                key=lambda name: column_data[name].iloc[0]['boundingBox'][0],
	            )
	            dictword = {col: column_data[col] for col in ordered_columns}

	            df = self._linklist.build_structure(dictword)
	            df = df.loc[:, ordered_columns]
	            # Strip the "__<row index>__" suffix added to discovered column keys.
	            df = df.rename(columns=lambda col: re.sub(r'__\d+__', '', str(col)).strip())
	            df_postp = self.postprocess(df)

	            # Assign generic column names
	            df.columns = [f"column {i+1}" for i in range(df.shape[1])]
	            table_data.append((df, df_postp))

	        if not table_data:
	            return (pd.DataFrame(), pd.DataFrame()), cords
	        return table_data[0], cords