from src.table_creator.table_extractor import TableExtraction
import streamlit as st
import base64
from PIL import Image
import os
import cv2
import numpy as np
import tempfile
import traceback
# Build the table extractor a single time per session: the instance is parked
# in st.session_state and reused whenever the key is already present.
if "tab_ext" not in st.session_state:
    extractor = TableExtraction()
    st.session_state.tab_ext = extractor
    print('Models loaded.')
def process_image(imgpath):
    """Run the session's table extractor on the image stored at *imgpath*.

    Delegates to ``st.session_state.tab_ext.detect`` and hands its result
    back unchanged.
    """
    extractor = st.session_state.tab_ext
    return extractor.detect(imgpath)
def draw_bounding_box(image, bbox):
    """Return a copy of *image* with *bbox* outlined in green.

    Args:
        image: Image to annotate; converted with ``np.array`` so the caller's
            object is left untouched.
        bbox: Four values ``(x_min, y_min, x_max, y_max)``. Presumably pixel
            coordinates in the image's own frame -- confirm against the
            detector that produces them.

    Returns:
        A new ``PIL.Image.Image`` carrying a 2-pixel-wide green rectangle.
    """
    img_array = np.array(image)
    is_multichannel = img_array.ndim == 3
    if is_multichannel:
        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
    # cv2.rectangle only accepts integer points; detectors commonly return
    # floats (or array/tensor scalars), so coerce each coordinate first.
    x_min, y_min, x_max, y_max = (int(round(float(value))) for value in bbox)
    cv2.rectangle(img_array, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
    if is_multichannel:
        img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
    return Image.fromarray(img_array)
# Page-wide settings: wide layout, sidebar opened on load so the guide shows.
_page_settings = {
    "page_title": "Table Extraction Tool",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)
# Global styling hook, rendered as raw HTML.
_global_css = """
"""
st.markdown(_global_css, unsafe_allow_html=True)
# Sidebar: user guide (overview, usage steps, tips, technical notes, contact).
with st.sidebar:
    st.divider()
    st.markdown('', unsafe_allow_html=True)

    # How It Works section
    st.markdown('', unsafe_allow_html=True)
    st.markdown("""
This tool uses advanced computer vision and machine learning techniques to:
- Detect and locate tables in document images
- Extract structured data from the detected tables
- Convert the data into easily manageable formats
""", unsafe_allow_html=True)

    # Usage Instructions. The format list mirrors the types accepted by the
    # file uploader further down (png, jpg, jpeg).
    st.markdown('', unsafe_allow_html=True)
    st.markdown("""
1
Upload a document image containing a table (PNG, JPG, or JPEG format)
2
The tool will automatically detect and highlight the table in your image
3
View both raw and enhanced versions of the extracted data
4
Download the results in CSV format for further use
""", unsafe_allow_html=True)

    # Best Practices
    st.markdown('', unsafe_allow_html=True)
    st.markdown("""
For Best Results:
- Use clear, high-resolution images
- Ensure tables have well-defined borders
- Avoid skewed or rotated images
- Make sure text is clearly readable
""", unsafe_allow_html=True)

    # Technical Details (collapsible)
    with st.expander("đ§ Technical Details"):
        st.markdown("""
Algorithm Overview:
- Uses computer vision for table boundary detection
- Employs OCR (Optical Character Recognition) for text extraction
- Implements intelligent cell segmentation
- Applies post-processing for enhanced accuracy
""", unsafe_allow_html=True)

    # Support Info
    st.markdown('', unsafe_allow_html=True)
    st.markdown("""
If you encounter any issues or have questions, feel free to reach out:
GitHub
|
LinkedIn
|
Medium
""", unsafe_allow_html=True)
# Seed the expanded-view flag only if this session does not have one yet.
if "is_expanded" not in st.session_state:
    st.session_state.is_expanded = False

# Page heading plus one-line description, rendered as raw HTML/markdown.
_title_block = """
đ Table Extraction Tool
Upload an image containing tables and instantly convert them into structured data formats.
"""
st.markdown(_title_block, unsafe_allow_html=True)
# File upload section.
# The widget gets a real label for accessibility; it is hidden with
# label_visibility so the page looks the same as with a blank label.
uploaded_file = st.file_uploader(
    "Upload table image",
    type=['png', 'jpg', 'jpeg'],
    label_visibility="collapsed",
)
st.markdown('\n', unsafe_allow_html=True)
def _render_table_with_html(df):
    """Show *df* as a table, followed by its HTML source in a code box."""
    st.dataframe(
        df,
        use_container_width=True,
        height=600 if not st.session_state.is_expanded else None,
    )
    st.markdown("### đ Copy HTML Table")
    html_table = df.to_html(index=False)
    st.markdown("""
âšī¸ This HTML can be copied and used directly in websites, LLM prompts, or other applications.
""", unsafe_allow_html=True)
    st.markdown("""
""", unsafe_allow_html=True)
    st.code(html_table, language="html")
    st.markdown("\n", unsafe_allow_html=True)


def _csv_download_button(df, label, file_name, key):
    """Render a full-width button that downloads *df* as CSV (no index)."""
    st.download_button(
        label=label,
        data=df.to_csv(index=False),
        file_name=file_name,
        mime="text/csv",
        use_container_width=True,
        key=key,
    )


# Process the uploaded file
if uploaded_file is not None:
    with st.spinner('đ Processing your image...'):
        # delete=False keeps the file on disk after this handle closes so it
        # can be reopened by path below; the finally clause removes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            temp_path = tmp_file.name
        try:
            image = Image.open(uploaded_file)
            (raw_df, cleaned_df), bbox = process_image(temp_path)
            st.session_state.raw_data = raw_df
            st.session_state.processed_data = cleaned_df
            # Only the first entry of bbox is drawn on the preview.
            marked_image = draw_bounding_box(image, bbox[0])
            st.session_state.marked_image = marked_image

            # Side by side layout: annotated image left, extracted data right.
            col1, col2 = st.columns([0.4, 0.6])
            with col1:
                st.divider()
                st.markdown('', unsafe_allow_html=True)
                st.image(marked_image, use_container_width=True)
                st.markdown('\n', unsafe_allow_html=True)
            with col2:
                st.divider()
                st.markdown('', unsafe_allow_html=True)
                tabs = st.tabs(["đ Raw Data", "⨠Enhanced Data â"])
                with tabs[0]:
                    _render_table_with_html(st.session_state.raw_data)
                with tabs[1]:
                    st.markdown("""
â This is our enhanced version of the table with improved formatting and structure.
""", unsafe_allow_html=True)
                    _render_table_with_html(st.session_state.processed_data)

            # Download section below both columns
            st.divider()
            st.markdown('', unsafe_allow_html=True)
            download_cols = st.columns([1, 0.1, 1])
            with download_cols[0]:
                if 'raw_data' in st.session_state:
                    _csv_download_button(
                        st.session_state.raw_data,
                        "đĨ Download Raw Data",
                        "raw_data.csv",
                        "raw_download",
                    )
            with download_cols[2]:
                if 'processed_data' in st.session_state:
                    _csv_download_button(
                        st.session_state.processed_data,
                        "đĨ Download Enhanced Data â",
                        "enhanced_data.csv",
                        "enhanced_download",
                    )
            st.markdown('\n', unsafe_allow_html=True)
        except Exception:
            # UI boundary: any failure is reported on the page with its
            # traceback instead of aborting the script run.
            st.error(f"â Error processing image: {traceback.format_exc()}")
        finally:
            # Always remove the temp copy, whether or not processing worked.
            try:
                os.unlink(temp_path)
            except OSError as e:
                st.warning(f"â ī¸ Error removing temporary file: {str(e)}")