Update app.py
app.py CHANGED
@@ -1,3 +1,6 @@
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
 import streamlit as st
 import cv2
 import numpy as np
@@ -5,7 +8,6 @@ from PIL import Image
 import torch
 from transformers import TableTransformerForObjectDetection, DetrImageProcessor
 import pytesseract
-from scipy.spatial import distance as dist
 
 # ==============================================================================
 # UI config
@@ -18,7 +20,7 @@ st.markdown("""
 """, unsafe_allow_html=True)
 
 # ==============================================================================
-# Load model
+# Load model (only if you use the Transformer option)
 # ==============================================================================
 @st.cache_resource
 def load_model():
@@ -31,10 +33,11 @@ def load_model():
     model.eval()
     return model, proc
 
-model, processor = load_model()
+# Lazy-init so the app runs even if you only use the line-based method
+model, processor = None, None
 
 # ==============================================================================
-#
+# Page-safe preprocessing (no inner-table cropping) + robust deskew
 # ==============================================================================
 def order_points(pts):
     xSorted = pts[np.argsort(pts[:, 0]), :]
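
The body of `load_model()` is untouched and sits mostly outside these hunks; only its tail (`model.eval()` / `return model, proc`) is visible as context. For orientation, a minimal sketch of what a TATR loader of this shape typically looks like — the checkpoint name here is an assumption, not something this diff shows:

```python
# Hypothetical sketch; the actual load_model() body is not shown in this diff.
@st.cache_resource
def load_model():
    # Assumed checkpoint -- the app may pin a different one.
    name = "microsoft/table-transformer-structure-recognition"
    proc = DetrImageProcessor.from_pretrained(name)
    model = TableTransformerForObjectDetection.from_pretrained(name)
    model.eval()  # inference only, matching the context lines above
    return model, proc
```

With the lazy-init change, this download/initialization cost is only paid if the Transformer method is actually selected.
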
@@ -63,33 +66,27 @@ def touches_border(cnt, w, h, m=12):
     return sides >= 3
 
 def find_page_quad(image, min_area_ratio=0.85):
-    """Return 4-point page quad only if it looks like the outer page."""
     h, w = image.shape[:2]
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     gray = cv2.GaussianBlur(gray, (5,5), 0)
     edges = cv2.Canny(gray, 50, 150)
     edges = cv2.dilate(edges, np.ones((3,3), np.uint8), 1)
-
     cnts, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     if not cnts:
         return None
-    best = None
-    best_area = 0
-    img_area = w * h
+    best, best_area = None, 0
+    img_area = w*h
     for c in cnts:
         peri = cv2.arcLength(c, True)
-        approx = cv2.approxPolyDP(c, 0.02*peri, True)
-        if len(approx) != 4:
+        approx = cv2.approxPolyDP(c, 0.02*peri, True)
+        if len(approx) != 4:
             continue
         area = cv2.contourArea(approx)
-        if area > best_area and touches_border(approx, w, h) and (area/img_area) >= min_area_ratio:
-            best_area = area
-            best = approx.reshape(4, 2)
-
-    return best # None if not a real page quad
+        if area > best_area and touches_border(approx, w, h) and (area/img_area) >= min_area_ratio:
+            best_area, best = area, approx.reshape(4,2)
+    return best
 
 def correct_orientation(image):
-    """Rotate according to Tesseract OSD (CW angle), fallback heuristic."""
     try:
         osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT, timeout=5)
         rotation = int(osd.get("rotate", 0))
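
`find_page_quad` returns the quad unordered; `order_points` (context above) sorts it, presumably into the usual top-left, top-right, bottom-right, bottom-left order. The `four_point_warp` that `preprocess` calls is not shown anywhere in this diff, so treat the following as an illustrative sketch of the standard recipe it most likely follows, not the file's actual code:

```python
# Sketch of a standard four-point perspective warp, assuming order_points
# yields (tl, tr, br, bl). The real four_point_warp is outside this diff.
def four_point_warp(image, pts):
    rect = order_points(np.asarray(pts, dtype="float32"))
    (tl, tr, br, bl) = rect
    # Output size = longest opposing edges of the quad
    wA, wB = np.linalg.norm(br - bl), np.linalg.norm(tr - tl)
    hA, hB = np.linalg.norm(tr - br), np.linalg.norm(tl - bl)
    W, H = int(max(wA, wB)), int(max(hA, hB))
    dst = np.array([[0, 0], [W - 1, 0], [W - 1, H - 1], [0, H - 1]],
                   dtype="float32")
    M = cv2.getPerspectiveTransform(rect, dst)
    return cv2.warpPerspective(image, M, (W, H))
```
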
@@ -98,6 +95,7 @@ def correct_orientation(image):
             return cv2.rotate(image, rot_map[rotation])
         return image
     except Exception:
+        # Fallback heuristic
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
        rots = {0: thr, 90: cv2.rotate(thr, cv2.ROTATE_90_CLOCKWISE),
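
The `rot_map` used in the context line above is defined outside this hunk. A plausible definition, assuming (as the docstring removed in the previous hunk stated) that Tesseract OSD's `rotate` value is the clockwise correction angle:

```python
# Assumed mapping; the actual rot_map definition is not shown in this diff.
# OSD reports the clockwise rotation needed to upright the page, and these
# cv2.rotate codes apply exactly that correction. rotation == 0 never
# reaches the lookup because the code falls through to `return image`.
rot_map = {
    90: cv2.ROTATE_90_CLOCKWISE,
    180: cv2.ROTATE_180,
    270: cv2.ROTATE_90_COUNTERCLOCKWISE,
}
```
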
@@ -116,12 +114,10 @@ def correct_orientation(image):
     return image
 
 def deskew_hough(image):
-    """Use dominant Hough-line angle for small residual tilt (no cropping)."""
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     edges = cv2.Canny(gray, 50, 150, apertureSize=3)
     lines = cv2.HoughLines(edges, 1, np.pi/180, threshold=200)
     if lines is None:
-        # fallback: minAreaRect on ink pixels
         thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
         coords = np.column_stack(np.where(thr == 0))
         if len(coords) < 100:
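
The middle of `deskew_hough` is unchanged and elided here; the visible tail feeds a rotation matrix `M` into `cv2.warpAffine`, so the elided part evidently reduces the Hough lines to one small median tilt angle. A sketch of that step, under the assumption it follows the usual recipe:

```python
# Hypothetical sketch of the elided angle computation in deskew_hough.
def _median_tilt_deg(lines, max_tilt=10.0):
    # cv2.HoughLines returns shape (N, 1, 2) pairs of (rho, theta), with
    # theta in [0, pi); a perfectly horizontal line has theta == pi/2.
    tilts = []
    for rho, theta in lines[:, 0]:
        t = np.degrees(theta) - 90.0
        if abs(t) <= max_tilt:       # keep only small residual tilts
            tilts.append(t)
    return float(np.median(tilts)) if tilts else 0.0

# ...which would feed the warp seen as context in the hunk below:
# M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
```
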
@@ -143,27 +139,117 @@ def deskew_hough(image):
     return cv2.warpAffine(image, M, (w,h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)
 
 def preprocess(image):
-    """Full pipeline that NEVER crops to the inner table."""
-    # 1) fix upside-down / sideways
     oriented = correct_orientation(image)
-
-    # 2) warp only if we really found the OUTER page; otherwise keep full image
     quad = find_page_quad(oriented, min_area_ratio=0.85)
     if quad is not None:
         warped = four_point_warp(oriented, quad)
     else:
-        warped = oriented
-
-    # 3) small deskew using Hough median angle so grid aligns to axes
+        warped = oriented
     return deskew_hough(warped)
 
-
-…
+# ==============================================================================
+# LINE-BASED table structure (precise on ruled tables)
+# ==============================================================================
+def merge_close_positions(positions, tol=8):
+    if not positions:
+        return []
+    positions = sorted(positions)
+    merged, cluster = [], [positions[0]]
+    for p in positions[1:]:
+        if abs(p - cluster[-1]) <= tol:
+            cluster.append(p)
+        else:
+            merged.append(int(round(np.mean(cluster))))
+            cluster = [p]
+    merged.append(int(round(np.mean(cluster))))
+    return merged
+
+def detect_grid_lines(image_bgr):
+    """Return x and y grid line positions (pixels)."""
+    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+    # Good default for scans/photos
+    binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
+                                   cv2.THRESH_BINARY_INV, 15, 7)
+
+    h, w = gray.shape
+    # Kernels sized to image
+    vert_len = max(10, h // 40)
+    horz_len = max(10, w // 40)
+
+    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vert_len))
+    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (horz_len, 1))
+
+    # Extract vertical lines
+    vtemp = cv2.erode(binary, vertical_kernel, iterations=1)
+    vlines = cv2.dilate(vtemp, vertical_kernel, iterations=1)
+    # Extract horizontal lines
+    htemp = cv2.erode(binary, horizontal_kernel, iterations=1)
+    hlines = cv2.dilate(htemp, horizontal_kernel, iterations=1)
+
+    # Find vertical positions by components that span most of the height
+    xs = []
+    cnts, _ = cv2.findContours(vlines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for c in cnts:
+        x, y, wc, hc = cv2.boundingRect(c)
+        if hc >= 0.65*h and wc <= 0.04*w:
+            xs.append(x + wc//2)
+
+    # Find horizontal positions by components that span most of the width
+    ys = []
+    cnts, _ = cv2.findContours(hlines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    for c in cnts:
+        x, y, wc, hc = cv2.boundingRect(c)
+        if wc >= 0.65*w and hc <= 0.04*h:
+            ys.append(y + hc//2)
+
+    # Merge duplicates / double rules.
+    # Adaptive tolerance: 8px or 30% of median spacing
+    def adaptive_tol(vals):
+        if len(vals) < 3:
+            return 8
+        diffs = np.diff(sorted(vals))
+        med = np.median(diffs)
+        return int(max(6, min(20, 0.3*med)))
+
+    xs = merge_close_positions(xs, tol=adaptive_tol(xs))
+    ys = merge_close_positions(ys, tol=adaptive_tol(ys))
+
+    # Require at least 2 lines each to make a grid
+    if len(xs) < 2 or len(ys) < 2:
+        return [], []
+    return xs, ys
+
+def draw_grid(image_bgr, xs, ys):
+    img = image_bgr.copy()
+    # Outer table box (min/max)
+    cv2.rectangle(img, (xs[0], ys[0]), (xs[-1], ys[-1]), (255, 0, 255), 2)
+    # Horizontal lines (green)
+    for y in ys:
+        cv2.line(img, (xs[0], y), (xs[-1], y), (0, 255, 0), 2)
+    # Vertical lines (blue)
+    for x in xs:
+        cv2.line(img, (x, ys[0]), (x, ys[-1]), (255, 0, 0), 2)
+    return img
+
+def line_based_table_structure(image_bgr):
+    xs, ys = detect_grid_lines(image_bgr)
+    if not xs or not ys:
+        # Nothing reliable found; just return original
+        return image_bgr.copy()
+    return draw_grid(image_bgr, xs, ys)
+
+# ==============================================================================
+# Transformer (TATR) structure (kept for comparison)
+# ==============================================================================
+def transformer_table_structure(image_bgr):
+    global model, processor
+    if model is None or processor is None:
+        model, processor = load_model()
+
     image_pil = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
     inputs = processor(images=image_pil, return_tensors="pt")
     with torch.inference_mode():
         outputs = model(**inputs)
-
     h, w = image_bgr.shape[:2]
     target_sizes = torch.tensor([[h, w]], dtype=torch.float32)
     results = processor.post_process_object_detection(outputs, threshold=0.6, target_sizes=target_sizes)[0]
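
`merge_close_positions` collapses double-ruled or slightly wavy lines into one coordinate by averaging runs of nearby positions (each new position is compared against the last member of the current cluster). A quick worked example, plus a small hypothetical helper showing how the `xs`/`ys` grid would translate into per-cell crop boxes, e.g. for OCR-ing each cell:

```python
merge_close_positions([103, 100, 257, 250, 256], tol=8)
# -> [102, 254]: clusters {100, 103} and {250, 256, 257}
#    collapse to their rounded means

# Hypothetical helper, not part of the commit: consecutive grid lines
# bound the cells, so adjacent (x, y) pairs give cell rectangles.
def cell_boxes(xs, ys):
    return [(xs[i], ys[j], xs[i + 1], ys[j + 1])   # (x0, y0, x1, y1)
            for j in range(len(ys) - 1)
            for i in range(len(xs) - 1)]
```
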
@@ -191,6 +277,12 @@ if "stage" not in st.session_state:
 with st.sidebar:
     st.title("🤖 Document AI Toolkit")
     st.markdown("---")
+
+    method = st.radio(
+        "Structure method",
+        ["Line-based (ruled tables) – Recommended", "Transformer (TATR)"],
+    )
+
     if st.button("🔄 Start Over", use_container_width=True):
         for k in list(st.session_state.keys()): del st.session_state[k]
         st.rerun()
@@ -206,17 +298,22 @@ with st.sidebar:
 elif st.session_state.stage == "processing":
     st.header("Step 2: Pre-process")
     if st.button("▶️ Start Pre-processing", use_container_width=True, type="primary"):
-        with st.spinner("Correcting orientation • detecting…"):
+        with st.spinner("Correcting orientation • detecting page • deskewing…"):
             st.session_state.processed_image = preprocess(st.session_state.original_image)
         st.session_state.stage = "analysis"; st.rerun()
 
 elif st.session_state.stage == "analysis":
     st.header("Step 3: Analyze Table")
     if st.button("📊 Find Table Structure", use_container_width=True, type="primary"):
-        with st.spinner("…"):
-            …
-            st.session_state.…
-            …
+        with st.spinner("Detecting grid…"):
+            if method.startswith("Line-based"):
+                st.session_state.annotated_image = line_based_table_structure(
+                    st.session_state.processed_image
+                )
+            else:
+                st.session_state.annotated_image = transformer_table_structure(
+                    st.session_state.processed_image
+                )
         st.session_state.stage = "done"; st.rerun()
 
 st.title("Document Processing Workflow")
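
Taken together, the commit leaves a pipeline that can be smoke-tested without the Streamlit UI; a minimal sketch (file paths are placeholders):

```python
# Standalone usage sketch; "scan.jpg" is a placeholder path.
img = cv2.imread("scan.jpg")                    # BGR, as the pipeline expects
clean = preprocess(img)                         # orient -> page warp -> deskew
annotated = line_based_table_structure(clean)   # grid overlay, or a copy if no grid found
cv2.imwrite("annotated.jpg", annotated)
```
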