Knightmovies committed on
Commit
65c6a60
·
verified ·
1 Parent(s): 96d6384

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -138
app.py CHANGED
@@ -11,15 +11,9 @@ from scipy.spatial import distance as dist
11
  # App Configuration & Model Loading
12
  # ==============================================================================
13
 
14
- # For Hugging Face Spaces deployment, you also need these two files:
15
- # 1. requirements.txt (listing all Python libraries)
16
- # 2. packages.txt (containing the line "tesseract-ocr")
17
- # NOTE: With this new code, you can remove 'matplotlib' from requirements.txt
18
-
19
- # Set Streamlit page configuration
20
# Configure the Streamlit page; must run before any other st.* call.
_PAGE_CONFIG = {
    "page_title": "Document Scanner & Table Recognizer",
    "page_icon": "📄",
    "layout": "wide",
}
st.set_page_config(**_PAGE_CONFIG)
25
 
@@ -27,16 +21,15 @@ st.set_page_config(
27
@st.cache_resource
def load_model():
    """Load and cache the Table Transformer processor/model pair.

    Returns:
        tuple: (DetrImageProcessor, TableTransformerForObjectDetection),
        both loaded from the same structure-recognition checkpoint.
    """
    checkpoint = "microsoft/table-transformer-structure-recognition"
    print("Loading model...")
    loaded_processor = DetrImageProcessor.from_pretrained(checkpoint)
    loaded_model = TableTransformerForObjectDetection.from_pretrained(checkpoint)
    print("Model loaded successfully.")
    return loaded_processor, loaded_model

processor, model = load_model()
37
 
38
  # ==============================================================================
39
- # Core Image Processing Functions
40
  # ==============================================================================
41
 
42
  def order_points(pts):
@@ -75,157 +68,122 @@ def find_and_straighten_document(image):
75
  return perspective_transform(image, box)
76
 
77
def correct_orientation(image):
    """
    Corrects the orientation of an image using a robust cascade method.
    1. Tries fast OSD (Orientation and Script Detection).
    2. If OSD fails, falls back to analyzing word bounding boxes.

    Args:
        image: BGR document image (numpy array, as from cv2.imdecode).

    Returns:
        The image rotated upright (BGR); returned unchanged when no
        rotation is detected.
    """
    print("--- Running Orientation Check ---")

    # --- METHOD 1: Fast Orientation and Script Detection (OSD) ---
    try:
        # timeout keeps a hung Tesseract process from blocking the app
        osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT, timeout=5)
        rotation = osd['rotate']
        print(f"OSD check successful. Detected rotation: {rotation} degrees.")
        if rotation > 0:
            # Tesseract's rotation is counter-clockwise
            if rotation == 90:
                return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
            elif rotation == 180:
                return cv2.rotate(image, cv2.ROTATE_180)
            else: # 270
                return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
        return image # Return as is if rotation is 0
    except Exception as e:
        print(f"OSD check failed: {e}. Falling back to word box analysis.")

    # --- METHOD 2: Fallback using Word Bounding Box Analysis ---
    # This method is slower but more robust for images with little text.
    # Idea: the correct orientation is the one in which Tesseract reports
    # the most wider-than-tall ("horizontal") word boxes.
    best_rotation = 0
    max_horizontal_boxes = -1

    # Pre-process image once for all rotations
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Candidate orientations, keyed by the CLOCKWISE angle applied to thresh.
    orientations = {
        0: thresh,
        90: cv2.rotate(thresh, cv2.ROTATE_90_CLOCKWISE),
        180: cv2.rotate(thresh, cv2.ROTATE_180),
        270: cv2.rotate(thresh, cv2.ROTATE_90_COUNTERCLOCKWISE)
    }

    for angle, rotated_img in orientations.items():
        try:
            data = pytesseract.image_to_data(rotated_img, output_type=pytesseract.Output.DICT, timeout=5)
            horizontal_boxes = 0
            num_boxes = len(data['level'])
            for i in range(num_boxes):
                # We only consider word-level boxes (level 5) with some confidence
                if data['level'][i] == 5 and int(data['conf'][i]) > 10:
                    w = data['width'][i]
                    h = data['height'][i]
                    if w > h: # Check if the box is horizontal
                        horizontal_boxes += 1

            print(f" Rotation {angle}°: Found {horizontal_boxes} horizontal word boxes.")

            if horizontal_boxes > max_horizontal_boxes:
                max_horizontal_boxes = horizontal_boxes
                best_rotation = angle
        except Exception as e:
            # A failed OCR pass for one angle should not abort the others.
            print(f" Word box analysis failed for rotation {angle}°: {e}")
            continue

    print(f"--> Best rotation found at {best_rotation} degrees.")

    # Apply the best rotation to the ORIGINAL color image.
    # NOTE: candidates above were rotated clockwise, so the winning angle is
    # applied clockwise here too (opposite sense from METHOD 1's undo).
    if best_rotation == 90:
        return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    elif best_rotation == 180:
        return cv2.rotate(image, cv2.ROTATE_180)
    elif best_rotation == 270:
        return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    else: # 0 degrees
        return image
151
 
152
- # ==============================================================================
153
- # NEW AND IMPROVED: Table Structure Recognition using OpenCV for Drawing
154
- # ==============================================================================
155
def extract_and_draw_table_structure(image_bgr):
    """
    Takes a BGR image, finds table structure, and returns an image with
    bounding boxes drawn directly using OpenCV.
    """
    # Run the Table Transformer over an RGB copy of the input frame.
    rgb_frame = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    pil_frame = Image.fromarray(rgb_frame)
    model_inputs = processor(images=pil_frame, return_tensors="pt")
    with torch.no_grad():
        predictions = model(**model_inputs)

    # Rescale boxes to pixel coordinates; PIL size is (w, h), model wants (h, w).
    sizes = torch.tensor([pil_frame.size[::-1]])
    detections = processor.post_process_object_detection(predictions, threshold=0.7, target_sizes=sizes)[0]

    annotated = image_bgr.copy()
    # BGR palette: rows green, columns red, the table outline magenta.
    colors = {"table row": (0, 255, 0), "table column": (0, 0, 255), "table": (255, 0, 255)}

    for score, label, box in zip(detections["scores"], detections["labels"], detections["boxes"]):
        class_name = model.config.id2label[label.item()]
        if class_name not in colors:
            continue
        xmin, ymin, xmax, ymax = map(int, box.tolist())
        cv2.rectangle(annotated, (xmin, ymin), (xmax, ymax), colors[class_name], 2)

    return annotated
189
 
190
# ==============================================================================
# Streamlit UI (Unchanged)
# ==============================================================================

st.title("📄 Document Scanner & Table Recognizer")
st.write("Upload a document photo. The app will automatically straighten it, fix the orientation, and detect the table structure using a Transformer model.")

uploaded_file = st.file_uploader("Choose a document image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Decode the uploaded bytes into an OpenCV BGR image.
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    input_image = cv2.imdecode(file_bytes, 1) # 1 = cv2.IMREAD_COLOR (3-channel BGR)

    st.subheader("1. Original Image")
    st.image(cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB), caption="Your Uploaded Image", use_container_width=True)

    with st.spinner("Processing your document... This may take a moment."):
        straightened_image = find_and_straighten_document(input_image)
        # Fall back to the raw upload if straightening produced nothing usable.
        image_to_process = straightened_image if straightened_image is not None and straightened_image.size > 0 else input_image
        final_image = correct_orientation(image_to_process)

        # This now returns a BGR image from OpenCV
        image_with_structure_bgr = extract_and_draw_table_structure(final_image)

    st.subheader("2. Corrected Document & Detected Structure")
    col1, col2 = st.columns(2)

    with col1:
        final_image_rgb = cv2.cvtColor(final_image, cv2.COLOR_BGR2RGB)
        st.image(final_image_rgb, caption="Auto-Corrected & Oriented", use_container_width=True)

        _, buf = cv2.imencode(".jpg", final_image) # Use the BGR image for encoding
        st.download_button(
            label="Download Clean Image",
            data=buf.tobytes(),
            file_name="corrected_document.jpg",
            mime="image/jpeg",
        )

    with col2:
        image_with_structure_rgb = cv2.cvtColor(image_with_structure_bgr, cv2.COLOR_BGR2RGB)
        st.image(image_with_structure_rgb, caption="Detected Table Structure (Rows: Green, Columns: Red)", use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  # App Configuration & Model Loading
12
  # ==============================================================================
13
 
 
 
 
 
 
 
14
# Page chrome; st.set_page_config must be the first Streamlit call on the page.
st.set_page_config(layout="wide", page_title="Document AI Toolkit", page_icon="🤖")
19
 
 
21
@st.cache_resource
def load_model():
    """Load the Table Transformer structure-recognition processor and model.

    Cached by Streamlit, so the heavy download/initialization (and the
    "cache miss" message) happens only on a cold start.
    """
    st.write("Cache miss: Loading Table Transformer model...")
    checkpoint = "microsoft/table-transformer-structure-recognition"
    return (
        DetrImageProcessor.from_pretrained(checkpoint),
        TableTransformerForObjectDetection.from_pretrained(checkpoint),
    )

processor, model = load_model()
30
 
31
  # ==============================================================================
32
+ # Core Image Processing Functions (Unchanged)
33
  # ==============================================================================
34
 
35
  def order_points(pts):
 
68
  return perspective_transform(image, box)
69
 
70
def correct_orientation(image):
    """Correct page orientation using Tesseract OSD.

    Args:
        image: BGR document image (numpy array, as from cv2.imdecode).

    Returns:
        The image rotated upright (BGR). The original image is returned when
        OSD reports no rotation or when the OSD call fails.
    """
    try:
        osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)
        rotation = osd['rotate']
        if rotation in (90, 180, 270):
            # Tesseract's reported rotation is undone by rotating the
            # opposite way (its convention is counter-clockwise).
            if rotation == 90:
                return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
            if rotation == 180:
                return cv2.rotate(image, cv2.ROTATE_180)
            return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)  # 270
        # BUG FIX: previously this path (rotation == 0) fell off the end of
        # the try block and the function implicitly returned None, crashing
        # downstream cv2.cvtColor calls. Return the image unchanged instead.
        return image
    except Exception as e:
        st.warning(f"OSD check failed: {e}. Using original orientation.")
        return image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
 
 
 
86
def extract_and_draw_table_structure(image_bgr):
    """Detect table structure with Table Transformer and draw it.

    Args:
        image_bgr: Document image in OpenCV BGR channel order.

    Returns:
        A copy of ``image_bgr`` (BGR) with detected rectangles drawn:
        rows green, columns red, table outline magenta.
    """
    image_pil = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    inputs = processor(images=image_pil, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-processing expects (height, width); PIL reports (width, height).
    target_sizes = torch.tensor([image_pil.size[::-1]])
    results = processor.post_process_object_detection(outputs, threshold=0.7, target_sizes=target_sizes)[0]

    colors = {"table row": (0, 255, 0), "table column": (0, 0, 255), "table": (255, 0, 255)}
    img_with_boxes = image_bgr.copy()

    for label_tensor, box_tensor in zip(results["labels"], results["boxes"]):
        name = model.config.id2label[label_tensor.item()]
        color = colors.get(name)
        if color is None:
            continue
        x0, y0, x1, y1 = (int(v) for v in box_tensor.tolist())
        cv2.rectangle(img_with_boxes, (x0, y0), (x1, y1), color, 2)

    return img_with_boxes
102
 
103
  # ==============================================================================
104
+ # UI Functions for Each Step
105
  # ==============================================================================
106
 
107
def initialize_state():
    """Seed st.session_state with the app's defaults (first run only)."""
    if "stage" in st.session_state:
        return  # already initialized for this session
    st.session_state.stage = "upload"
    st.session_state.original_image = None
    st.session_state.processed_image = None
113
+
114
def reset_app():
    """Reset the app to the initial upload stage.

    Clears every entry from st.session_state, then re-seeds the defaults
    via initialize_state().
    """
    # BUG FIX: snapshot the keys before deleting. Deleting entries while
    # iterating the live .keys() view raises "dictionary changed size
    # during iteration".
    for key in list(st.session_state.keys()):
        del st.session_state[key]
    initialize_state()
119
+
120
# --- Main App UI ---
# Three-stage wizard driven by st.session_state.stage:
# "upload" -> "process" -> "analyze". Each button advances the stage and
# calls st.rerun() so the script re-executes with the new stage value.
initialize_state()

st.title("🤖 Document AI Toolkit")
st.markdown("---")

# Use columns for a centered and constrained layout
left_col, main_col, right_col = st.columns([1, 4, 1])

with main_col:
    # --- STAGE 1: UPLOAD ---
    if st.session_state.stage == "upload":
        st.header("Step 1: Upload Your Document")
        uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

        if uploaded_file:
            # Decode the upload into an OpenCV BGR image and persist it
            # across reruns in session state.
            file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
            st.session_state.original_image = cv2.imdecode(file_bytes, 1)
            st.image(cv2.cvtColor(st.session_state.original_image, cv2.COLOR_BGR2RGB), caption="Original Upload", use_container_width=True)

            if st.button("▶️ Start Pre-processing"):
                st.session_state.stage = "process"
                st.rerun()

    # --- STAGE 2: PRE-PROCESSING ---
    elif st.session_state.stage == "process":
        st.header("Step 2: Pre-processing Result")
        with st.spinner("Straightening and correcting orientation..."):
            original_image = st.session_state.original_image
            straightened = find_and_straighten_document(original_image)
            # Fall back to the raw upload if straightening produced nothing usable.
            image_to_orient = straightened if straightened is not None and straightened.size > 0 else original_image
            st.session_state.processed_image = correct_orientation(image_to_orient)

        st.image(cv2.cvtColor(st.session_state.processed_image, cv2.COLOR_BGR2RGB), caption="Corrected Document", use_container_width=True)
        st.info("The document has been straightened and oriented.")

        if st.button("📊 Find Table Structure"):
            st.session_state.stage = "analyze"
            st.rerun()

        if st.button("↩️ Upload New Image"):
            reset_app()
            st.rerun()

    # --- STAGE 3: ANALYSIS ---
    elif st.session_state.stage == "analyze":
        st.header("Step 3: Table Structure Analysis")
        processed_image = st.session_state.processed_image
        with st.spinner("Running Table Transformer model... This can take a moment."):
            annotated_image = extract_and_draw_table_structure(processed_image)

        st.subheader("Final Results")

        # Display results side-by-side
        res_col1, res_col2 = st.columns(2)
        with res_col1:
            st.image(cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB), caption="Cleaned Document", use_container_width=True)
            # Encode the BGR image to JPEG bytes for the download button.
            _, buf = cv2.imencode(".jpg", processed_image)
            st.download_button(
                label="📥 Download Clean Image",
                data=buf.tobytes(),
                file_name="corrected_document.jpg",
                mime="image/jpeg",
            )
        with res_col2:
            st.image(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB), caption="Detected Table Structure", use_container_width=True)

        if st.button("🔄 Start Over"):
            reset_app()
            st.rerun()