|
from src.table_creator.table_extractor import TableExtraction |
|
import streamlit as st |
|
import base64 |
|
from PIL import Image |
|
import os |
|
import cv2 |
|
import numpy as np |
|
import tempfile |
|
import traceback |
|
|
|
|
|
# Streamlit re-executes this script on every interaction, so the extractor is
# created only when the session does not hold one yet and is reused afterwards.
if "tab_ext" not in st.session_state:
    st.session_state["tab_ext"] = TableExtraction()
    print("Models loaded.")
|
|
|
|
|
def process_image(imgpath):
    """Run the session's table extractor on the image stored at ``imgpath``.

    Returns whatever ``TableExtraction.detect`` returns; the caller in this
    file unpacks it as ``(raw_df, cleaned_df), bbox``.
    """
    extractor = st.session_state.tab_ext
    return extractor.detect(imgpath)
|
|
|
def draw_bounding_box(image, bbox):
    """Return a copy of ``image`` with ``bbox`` outlined in green.

    Args:
        image: Image to annotate, normally a ``PIL.Image.Image``; anything
            ``np.array`` can turn into an H x W or H x W x C array also works.
            The input itself is not modified.
        bbox: Four values ``(x_min, y_min, x_max, y_max)`` in pixel
            coordinates. Non-integer values are rounded to the nearest pixel.

    Returns:
        A new RGB ``PIL.Image.Image`` with a 2-pixel green rectangle drawn on it.
    """
    # Normalise to three RGB channels first: on a grayscale or palette image
    # the colour tuple would be applied to a single channel (drawing a black
    # box), and on RGBA the outline would get an alpha of zero.
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
    canvas = np.array(image)
    if canvas.ndim == 2:
        canvas = cv2.cvtColor(canvas, cv2.COLOR_GRAY2RGB)
    elif canvas.shape[2] == 4:
        canvas = cv2.cvtColor(canvas, cv2.COLOR_RGBA2RGB)

    # cv2.rectangle only accepts integer corner points, while detectors
    # commonly report float coordinates.
    x_min, y_min, x_max, y_max = (int(round(float(value))) for value in bbox)

    # Pure green is the same triple in RGB and BGR, so the outline can be drawn
    # directly on the RGB array without a channel-order round-trip.
    cv2.rectangle(canvas, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

    return Image.fromarray(canvas)
|
|
|
|
|
|
|
# Page-level settings: browser tab title, full-width layout, and a sidebar
# that starts open.
_page_settings = {
    "page_title": "Table Extraction Tool",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)
|
|
|
|
|
|
|
# Stylesheet for the custom HTML fragments rendered through st.markdown
# (header banner, result headings, sidebar guide) and for Streamlit's tabs.
_CUSTOM_CSS = """
<style>
    /* Main container and background */
    .main { padding: 1.5rem; }
    .stApp {
        background: linear-gradient(135deg, #f6f9fc 0%, #f0f4f8 100%);
    }

    /* Header styling */
    .main-header {
        background: linear-gradient(90deg, #1a365d 0%, #2563eb 100%);
        color: white;
        padding: 2rem 3rem;
        border-radius: 15px;
        margin-bottom: 2rem;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
    }

    .main-header h1 {
        font-size: 2.5rem;
        margin-bottom: 0.5rem;
        font-weight: 600;
        color: white;
    }

    .main-header p {
        font-size: 1.1rem;
        opacity: 0.9;
    }

    /* Card containers */
    .content-card {
        background-color: white;
        padding: 1.5rem;
        border-radius: 12px;
        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
        border: 1px solid #e5e7eb;
        margin-bottom: 1.5rem;
    }

    /* Upload section - Reduced size */
    .upload-section {
        text-align: center;
        padding: 1rem;
        border: 2px dashed #e5e7eb;
        border-radius: 12px;
        background-color: #f8fafc;
        max-width: 600px;
        margin: 0 auto;
    }

    .upload-icon {
        font-size: 1.5rem;
        color: #2563eb;
        margin-bottom: 0.5rem;
    }

    /* Results section */
    .results-header {
        font-size: 1.25rem;
        color: #1f2937;
        margin-bottom: 1rem;
        padding-bottom: 0.5rem;
        border-bottom: 2px solid #e5e7eb;
    }

    /* Download buttons */
    .download-button {
        display: inline-block;
        padding: 0.75rem 1.5rem;
        background-color: #2563eb;
        color: white;
        text-decoration: none;
        border-radius: 8px;
        transition: all 0.2s;
        text-align: center;
        width: 100%;
    }

    .download-button:hover {
        background-color: #1d4ed8;
        box-shadow: 0 4px 6px -1px rgba(37, 99, 235, 0.2);
    }

    /* Tabs styling */
    .stTabs [data-baseweb="tab-list"] {
        gap: 1rem;
        background-color: #f8fafc;
        padding: 0.5rem;
        border-radius: 8px;
    }

    .stTabs [data-baseweb="tab"] {
        color: #4b5563;
        font-weight: 500;
        padding: 0.5rem 1.5rem;
        border-radius: 6px;
    }

    .stTabs [data-baseweb="tab"][aria-selected="true"] {
        background-color: #2563eb;
        color: white;
    }

    /* Guide section styling */
    .guide-section {
        background-color: white;
        padding: 2rem;
        border-radius: 12px;
        margin-bottom: 1.5rem;
    }

    .guide-header {
        color: #1a365d;
        font-size: 1.5rem;
        margin-bottom: 1rem;
        border-bottom: 2px solid #e5e7eb;
        padding-bottom: 0.5rem;
    }

    .guide-subheader {
        color: #2563eb;
        font-size: 1.2rem;
        margin: 1.5rem 0 0.5rem 0;
    }

    .guide-text {
        color: #4b5563;
        line-height: 1.6;
        margin-bottom: 1rem;
    }

    .feature-card {
        background-color: #f8fafc;
        padding: 1rem;
        border-radius: 8px;
        margin-bottom: 1rem;
        border-left: 4px solid #2563eb;
    }

    .step-container {
        display: flex;
        align-items: flex-start;
        margin-bottom: 1rem;
    }

    .step-number {
        background-color: #2563eb;
        color: white;
        width: 24px;
        height: 24px;
        border-radius: 12px;
        display: flex;
        align-items: center;
        justify-content: center;
        margin-right: 1rem;
        flex-shrink: 0;
    }

    .info-icon {
        color: #2563eb;
        margin-right: 0.5rem;
    }

    .tech-details {
        background-color: #f0f9ff;
        padding: 1rem;
        border-radius: 8px;
        margin: 1rem 0;
    }

</style>
"""

# unsafe_allow_html is needed so the <style> element is passed through as HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
# Sidebar: static user guide (how it works, usage steps, tips, technical
# notes, contact links). All HTML uses the guide-* / step-* / feature-card /
# tech-details CSS classes.
#
# NOTE(review): the emoji in these headings had been corrupted by an encoding
# round-trip. 🎯, 💡 and 🔧 were recovered from the surviving bytes; 📖, 📝 and
# 🔗 are best guesses for glyphs that could not be recovered — confirm.
with st.sidebar:
    st.divider()
    st.markdown('<h2 class="guide-header">📖 User Guide</h2>', unsafe_allow_html=True)

    st.markdown('<h3 class="guide-subheader">🎯 How It Works</h3>', unsafe_allow_html=True)
    st.markdown("""
    <div class="guide-text">
        This tool uses advanced computer vision and machine learning techniques to:
        <ul>
            <li>Detect and locate tables in document images</li>
            <li>Extract structured data from the detected tables</li>
            <li>Convert the data into easily manageable formats</li>
        </ul>
    </div>
    """, unsafe_allow_html=True)

    st.markdown('<h3 class="guide-subheader">📝 Usage Instructions</h3>', unsafe_allow_html=True)

    # Step 1 lists PNG as well, matching the types the file uploader accepts.
    st.markdown("""
    <div class="step-container">
        <div class="step-number">1</div>
        <div class="guide-text">Upload a document image containing a table (PNG, JPG, or JPEG format)</div>
    </div>

    <div class="step-container">
        <div class="step-number">2</div>
        <div class="guide-text">The tool will automatically detect and highlight the table in your image</div>
    </div>

    <div class="step-container">
        <div class="step-number">3</div>
        <div class="guide-text">View both raw and enhanced versions of the extracted data</div>
    </div>

    <div class="step-container">
        <div class="step-number">4</div>
        <div class="guide-text">Download the results in CSV format for further use</div>
    </div>
    """, unsafe_allow_html=True)

    st.markdown('<h3 class="guide-subheader">💡 Best Practices</h3>', unsafe_allow_html=True)
    st.markdown("""
    <div class="feature-card">
        <strong>For Best Results:</strong>
        <ul>
            <li>Use clear, high-resolution images</li>
            <li>Ensure tables have well-defined borders</li>
            <li>Avoid skewed or rotated images</li>
            <li>Make sure text is clearly readable</li>
        </ul>
    </div>
    """, unsafe_allow_html=True)

    with st.expander("🔧 Technical Details"):
        st.markdown("""
        <div class="tech-details">
            <p><strong>Algorithm Overview:</strong></p>
            <ul>
                <li>Uses computer vision for table boundary detection</li>
                <li>Employs OCR (Optical Character Recognition) for text extraction</li>
                <li>Implements intelligent cell segmentation</li>
                <li>Applies post-processing for enhanced accuracy</li>
            </ul>
        </div>
        """, unsafe_allow_html=True)

    st.markdown('<h3 class="guide-subheader">🔗 Connect with Me</h3>', unsafe_allow_html=True)
    st.markdown("""
    <div class="guide-text" style="font-size: 1rem;">
        If you encounter any issues or have questions, feel free to reach out:
        <a href="https://github.com/Sudhanshu1304" target="_blank" style="text-decoration: none;">
            <img src="https://img.icons8.com/ios-filled/20/000000/github.png" alt="GitHub" style="vertical-align: middle; margin-right: 5px;"/>
            GitHub
        </a> |
        <a href="https://www.linkedin.com/in/sudhanshu-pandey-847448193/" target="_blank" style="text-decoration: none;">
            <img src="https://img.icons8.com/ios-filled/20/000000/linkedin.png" alt="LinkedIn" style="vertical-align: middle; margin-right: 5px;"/>
            LinkedIn
        </a> |
        <a href="https://medium.com/@sudhanshu.dpandey" target="_blank" style="text-decoration: none;">
            <img src="https://img.icons8.com/ios-filled/20/000000/medium-logo.png" alt="Medium" style="vertical-align: middle; margin-right: 5px;"/>
            Medium
        </a>
    </div>
    """, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
# Default for the flag the result tables read to choose between a fixed
# 600px height and an automatic one.
if "is_expanded" not in st.session_state:
    st.session_state["is_expanded"] = False
|
|
|
|
|
|
|
# Page banner, styled by the .main-header CSS class.
# NOTE(review): the title emoji had been corrupted by an encoding round-trip
# and could not be recovered from the remaining bytes; 📊 is a best guess.
st.markdown("""
<div class="main-header">
    <h1>📊 Table Extraction Tool</h1>
    <p>Upload an image containing tables and instantly convert them into structured data formats.</p>
</div>
""", unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Image upload widget. The label is given real text and hidden with
# label_visibility instead of being an empty string, which Streamlit
# discourages for accessibility reasons. The closing </div> that used to
# follow had no matching opening tag and has been dropped.
uploaded_file = st.file_uploader(
    "Upload a table image",
    type=['png', 'jpg', 'jpeg'],
    label_visibility="collapsed",
)
|
|
|
|
|
# NOTE(review): emoji in the user-facing strings below had been corrupted by
# an encoding round-trip. 📥, ✨, ℹ️ and ⚠️ were recovered from the surviving
# bytes; 🔄, 📄, 📋, ⭐ and ❌ are best guesses — confirm.


def _render_table_tab(table):
    """Show ``table`` as an interactive grid followed by its HTML source."""
    st.dataframe(
        table,
        use_container_width=True,
        height=None if st.session_state.is_expanded else 600,
    )

    st.markdown("### 📋 Copy HTML Table")
    st.markdown("""
    <div style="background-color: #f8fafc; padding: 0.5rem; border-radius: 8px; margin-bottom: 0.5rem;">
        <p style="margin: 0; color: #475569; font-size: 0.9rem;">
            ℹ️ This HTML can be copied and used directly in websites, LLM prompts, or other applications.
        </p>
    </div>
    """, unsafe_allow_html=True)
    # NOTE(review): this opening/closing pair is kept from the original, but
    # Streamlit appears to render every st.markdown call as its own element,
    # so the max-height wrapper probably does not constrain st.code — verify,
    # and consider st.container(height=...) if a scroll box is wanted.
    st.markdown("""
    <div style="max-height: 150px; overflow-y: auto; border-radius: 8px;">
    """, unsafe_allow_html=True)
    st.code(table.to_html(index=False), language="html")
    st.markdown("</div>", unsafe_allow_html=True)


# Main flow: persist the upload to a temporary file, run detection on it,
# render the annotated image, both tables and the CSV downloads, and always
# remove the temporary file afterwards.
if uploaded_file is not None:
    with st.spinner('🔄 Processing your image...'):
        # Keep the upload's own extension so a PNG is not stored as "*.jpg".
        suffix = os.path.splitext(uploaded_file.name)[1] or '.jpg'
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            temp_path = tmp_file.name

        try:
            image = Image.open(uploaded_file)
            (raw_df, cleaned_df), bbox = process_image(temp_path)

            st.session_state.raw_data = raw_df
            st.session_state.processed_data = cleaned_df

            # Only the first returned box is drawn. Without any box, fall back
            # to the plain image instead of failing on bbox[0].
            table_found = bbox is not None and len(bbox) > 0
            marked_image = draw_bounding_box(image, bbox[0]) if table_found else image
            st.session_state.marked_image = marked_image

            col1, col2 = st.columns([0.4, 0.6])

            with col1:
                st.divider()
                st.markdown('<h3 class="results-header">Detected Table</h3>', unsafe_allow_html=True)
                st.image(marked_image, use_container_width=True)
                if not table_found:
                    st.info("No table region was detected in this image.")

            with col2:
                st.divider()
                st.markdown('<h3 class="results-header">Extracted Data</h3>', unsafe_allow_html=True)

                raw_tab, enhanced_tab = st.tabs(["📄 Raw Data", "✨ Enhanced Data ⭐"])

                with raw_tab:
                    _render_table_tab(raw_df)

                with enhanced_tab:
                    st.markdown("""
                    <div style="background-color: #f0f9ff; padding: 1rem; border-radius: 8px; margin-bottom: 1rem;">
                        <p style="margin: 0; color: #1e40af;">
                            ⭐ This is our enhanced version of the table with improved formatting and structure.
                        </p>
                    </div>
                    """, unsafe_allow_html=True)
                    _render_table_tab(cleaned_df)

            st.divider()
            st.markdown('<h3 class="results-header">Download Options</h3>', unsafe_allow_html=True)
            # The narrow middle column is only a spacer between the buttons.
            raw_col, _, enhanced_col = st.columns([1, 0.1, 1])

            with raw_col:
                st.download_button(
                    label="📥 Download Raw Data",
                    data=raw_df.to_csv(index=False),
                    file_name="raw_data.csv",
                    mime="text/csv",
                    use_container_width=True,
                    key="raw_download",
                )

            with enhanced_col:
                st.download_button(
                    label="📥 Download Enhanced Data ⭐",
                    data=cleaned_df.to_csv(index=False),
                    file_name="enhanced_data.csv",
                    mime="text/csv",
                    use_container_width=True,
                    key="enhanced_download",
                )

        except Exception as exc:
            # Top-level boundary of the page: report the failure rather than
            # crash the app. The traceback goes through st.code so it is shown
            # verbatim instead of being interpreted as markdown.
            st.error(f"❌ Error processing image: {exc}")
            st.code(traceback.format_exc())

        finally:
            try:
                os.unlink(temp_path)
            except OSError as exc:
                st.warning(f"⚠️ Error removing temporary file: {exc}")
|
|