Spaces:

Knightmovies
/

ScannerUniversalRotator

Sleeping

App Files Community

Knightmovies committed on Sep 14

Commit

99975be

verified ·

1 Parent(s): c726dc7

Update app.py

Browse files

Files changed (1) hide show

app.py +301 -120

app.py CHANGED Viewed

@@ -1,145 +1,326 @@
 import gradio as gr
 import numpy as np
-import torch
-from doctr.models import ocr_predictor
-from doctr.utils.visualization import visualize_page
-import tempfile
 import cv2
 from PIL import Image
-print("Initializing Doctr model... This will download ~170MB of files on the first startup and can be slow.")
-# Load the pre-trained Doctr AI model.
-model = ocr_predictor(
-    det_arch='db_resnet50',
-    reco_arch='crnn_vgg16_bn',
-    pretrained=True,
-    detect_orientation=True,
-    assume_straight_pages=False
-)
-print("✅ Doctr model is ready.")
-def process_image_with_doctr(input_image_pil):
-    """
-    Processes an image using Doctr library to extract text and visualize detections.
-    """
-    if input_image_pil is None:
-        return None, None
-    # Convert PIL Image to RGB NumPy array
-    input_image_numpy = np.array(input_image_pil)
-    # Process the document with the AI model
-    result = model([input_image_numpy])
-    # Get the first page results
-    page = result.pages[0]
-    # Method 1: Create visualization with detected text boxes
-    try:
-        # Use doctr's built-in visualization
-        visualized_image = visualize_page(page.export(), input_image_numpy)
-        final_image_rgb = visualized_image
-    except Exception as e:
-        print(f"Visualization error: {e}")
-        # Fallback: return original image
-        final_image_rgb = input_image_numpy
-    # Convert to BGR for OpenCV saving
-    final_image_bgr = cv2.cvtColor(final_image_rgb, cv2.COLOR_RGB_BGR)
-    # Save to temporary file
-    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-        cv2.imwrite(tmp_file.name, final_image_bgr)
-        return final_image_rgb, tmp_file.name
-# Alternative version that extracts text and draws boxes manually
-def process_image_with_manual_boxes(input_image_pil):
-    """
-    Alternative approach: manually draw bounding boxes on detected text.
-    """
-    if input_image_pil is None:
-        return None, None
-    input_image_numpy = np.array(input_image_pil)
-    result = model([input_image_numpy])
-    # Create a copy of the original image to draw on
-    output_image = input_image_numpy.copy()
-    h, w = output_image.shape[:2]
-    # Extract text and draw bounding boxes
-    page = result.pages[0]
-    for block in page.blocks:
-        for line in block.lines:
-            for word in line.words:
-                # Get word geometry (normalized coordinates)
-                geometry = word.geometry
-                # Convert normalized coordinates to pixel coordinates
-                x1, y1 = int(geometry[0][0] * w), int(geometry[0][1] * h)
-                x2, y2 = int(geometry[1][0] * w), int(geometry[1][1] * h)
-                # Draw rectangle around detected word
-                cv2.rectangle(output_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
-                # Optionally add text
-                cv2.putText(output_image, word.value, (x1, y1-5),
-                           cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
-    # Save to temporary file
-    final_image_bgr = cv2.cvtColor(output_image, cv2.COLOR_RGB_BGR)
-    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-        cv2.imwrite(tmp_file.name, final_image_bgr)
-        return output_image, tmp_file.name
-# Text extraction function
-def extract_text_from_image(input_image_pil):
     """
-    Extract text from image and return both visualization and plain text.
     """
     if input_image_pil is None:
-        return None, None, ""
-    input_image_numpy = np.array(input_image_pil)
-    result = model([input_image_numpy])
-    # Extract all text
-    extracted_text = ""
-    for page in result.pages:
-        for block in page.blocks:
-            for line in block.lines:
-                line_text = " ".join([word.value for word in line.words])
-                extracted_text += line_text + "\n"
-    # Create visualization
-    page = result.pages[0]
-    try:
-        visualized_image = visualize_page(page.export(), input_image_numpy)
-    except:
-        visualized_image = process_image_with_manual_boxes(input_image_pil)[0]
-    # Save visualization
-    final_image_bgr = cv2.cvtColor(visualized_image, cv2.COLOR_RGB_BGR)
-    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-        cv2.imwrite(tmp_file.name, final_image_bgr)
-        return visualized_image, tmp_file.name, extracted_text.strip()
-# ==============================================================================
-# Gradio Interface
-# ==============================================================================
-demo = gr.Interface(
-    fn=extract_text_from_image,
-    inputs=gr.Image(type="pil", label="Upload Document Photo"),
-    outputs=[
-        gr.Image(type="numpy", label="Text Detection Visualization"),
-        gr.File(label="Download Visualization"),
-        gr.Textbox(label="Extracted Text", lines=10)
-    ],
-    title="📄 AI-Powered Document Scanner & OCR",
-    description="Upload a document image to detect and extract text using the Doctr deep learning library. The tool will show detected text regions and provide the extracted text.",
-    flagging_options=None
-)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import numpy as np
 import cv2
+import tempfile
 from PIL import Image
+import math
+import os
+def order_points(pts):
+    """Return the four corners as float32 rows: top-left, top-right, bottom-right, bottom-left.
+    Assumes pts is a (4, 2) array whose columns are (x, y).
+    """
+    # x + y is smallest at the top-left corner and largest at the bottom-right.
+    coord_sum = pts.sum(axis=1)
+    # y - x is smallest at the top-right corner and largest at the bottom-left.
+    coord_diff = np.diff(pts, axis=1)
+    corner_indices = (
+        np.argmin(coord_sum),   # top-left
+        np.argmin(coord_diff),  # top-right
+        np.argmax(coord_sum),   # bottom-right
+        np.argmax(coord_diff),  # bottom-left
+    )
+    ordered = np.zeros((4, 2), dtype="float32")
+    for slot, idx in enumerate(corner_indices):
+        ordered[slot] = pts[idx]
+    return ordered
+def four_point_transform(image, pts):
+    """Warp the quadrilateral given by pts onto an upright rectangle (bird's eye view)."""
+    def _edge_length(p, q):
+        # Euclidean distance between two corner points.
+        return np.sqrt(((p[0] - q[0]) ** 2) + ((p[1] - q[1]) ** 2))
+    corners = order_points(pts)
+    top_left, top_right, bottom_right, bottom_left = corners
+    # Output size: the longer of each pair of opposite edges, truncated to
+    # whole pixels, and never smaller than 100 px per side.
+    out_w = max(
+        int(_edge_length(bottom_right, bottom_left)),
+        int(_edge_length(top_right, top_left)),
+        100,
+    )
+    out_h = max(
+        int(_edge_length(top_right, bottom_right)),
+        int(_edge_length(top_left, bottom_left)),
+        100,
+    )
+    # Destination corners, listed in the same order as the source corners.
+    target = np.array(
+        [[0, 0], [out_w - 1, 0], [out_w - 1, out_h - 1], [0, out_h - 1]],
+        dtype="float32",
+    )
+    matrix = cv2.getPerspectiveTransform(corners, target)
+    return cv2.warpPerspective(image, matrix, (out_w, out_h))
+def detect_document_edges(image):
+    """Find the document outline and return its four corners as a float32 (4, 2) array.
+    Falls back to the corners of the whole image when no contour of area
+    >= 1000 simplifies to exactly four vertices.
+    """
+    def _frame_corners():
+        # Fallback quadrilateral: the corners of the full image.
+        rows, cols = image.shape[:2]
+        return np.array([[0, 0], [cols-1, 0], [cols-1, rows-1], [0, rows-1]], dtype="float32")
+    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+    smoothed = cv2.GaussianBlur(grayscale, (5, 5), 0)
+    edge_map = cv2.Canny(smoothed, 75, 200)
+    # Morphological closing bridges small gaps in the detected edges.
+    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
+    edge_map = cv2.morphologyEx(edge_map, cv2.MORPH_CLOSE, closing_kernel)
+    found, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    # Examine candidates from largest to smallest area; the first one that
+    # simplifies to a quadrilateral is taken as the document.
+    for candidate in sorted(found, key=cv2.contourArea, reverse=True):
+        if cv2.contourArea(candidate) < 1000:
+            continue
+        tolerance = 0.02 * cv2.arcLength(candidate, True)
+        polygon = cv2.approxPolyDP(candidate, tolerance, True)
+        if len(polygon) == 4:
+            return polygon.reshape(4, 2).astype("float32")
+    return _frame_corners()
+def enhance_document(image):
+    """Improve the readability of an RGB document image.
+    CLAHE (clipLimit 2.0, 8x8 tiles) is applied to the L channel in LAB
+    space and the result is converted back to RGB. If any step raises, a
+    simple linear contrast/brightness boost is returned instead.
+    """
+    try:
+        lab = cv2.cvtColor(image, cv2.COLOR_RGB2LAB)
+        lightness, a_channel, b_channel = cv2.split(lab)
+        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+        lightness = clahe.apply(lightness)
+        merged = cv2.merge([lightness, a_channel, b_channel])
+        return cv2.cvtColor(merged, cv2.COLOR_LAB2RGB)
+    except Exception as e:
+        # Deliberate best-effort fallback. Catching Exception (not a bare
+        # except) lets KeyboardInterrupt/SystemExit propagate, and the
+        # failure is logged instead of being silently hidden.
+        print(f"Enhancement failed, using simple contrast boost: {e}")
+        return cv2.convertScaleAbs(image, alpha=1.2, beta=10)
+def auto_rotate_image(image):
+    """Deskew an RGB image using the dominant direction of detected line segments.
+    Segments come from a probabilistic Hough transform on Canny edges. Each
+    segment's direction is folded into [-45, 45) degrees before the median is
+    taken, so horizontal segments, vertical segments, and either endpoint
+    order all vote for the same skew. Skews of 1 degree or less are ignored.
+    Returns the rotated image on an enlarged canvas, or the input unchanged
+    when no segments are found, the skew is negligible, or any step raises.
+    """
+    try:
+        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+        edges = cv2.Canny(gray, 50, 150, apertureSize=3)
+        lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100, minLineLength=100, maxLineGap=10)
+        if lines is None or len(lines) == 0:
+            return image
+        skews = []
+        for line in lines:
+            x1, y1, x2, y2 = line[0]
+            raw_deg = math.degrees(math.atan2(y2 - y1, x2 - x1))
+            # Fold into [-45, 45): directions 90 degrees apart describe the
+            # same page skew. Taking the median of unfolded angles mixes
+            # ~0 and ~+/-90 values and can yield a meaningless result.
+            skews.append((raw_deg + 45.0) % 90.0 - 45.0)
+        angle_deg = float(np.median(skews))
+        # Only rotate when the skew is larger than 1 degree.
+        if abs(angle_deg) <= 1:
+            return image
+        h, w = image.shape[:2]
+        center = (w // 2, h // 2)
+        M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
+        # Size the canvas to the rotated bounding box so nothing is cropped.
+        cos = np.abs(M[0, 0])
+        sin = np.abs(M[0, 1])
+        new_w = int((h * sin) + (w * cos))
+        new_h = int((h * cos) + (w * sin))
+        # Shift the transform so the image stays centred on the new canvas.
+        M[0, 2] += (new_w / 2) - center[0]
+        M[1, 2] += (new_h / 2) - center[1]
+        return cv2.warpAffine(image, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REFLECT)
+    except Exception as e:
+        print(f"Auto-rotation failed: {e}")
+    return image
+def scan_document(input_image_pil):
     """
+    Scan an uploaded document photo and return the cleaned-up result.
+    Pipeline, in the order it runs:
+    1. Auto-rotate to correct skew
+    2. Detect document edges
+    3. Apply perspective correction
+    4. Enhance image quality
+    Returns a tuple (image, file_path, status_message). On failure the
+    original image (or None if it could not be read) is returned with no
+    file path and an error message.
     """
     if input_image_pil is None:
+        return None, None, "❌ No image uploaded"
+    # Defined before the try block so the error path can always return it.
+    original_image = None
+    try:
+        # Convert PIL to numpy array
+        image = np.array(input_image_pil)
+        original_image = image.copy()
+        # Validate image
+        if image.size == 0:
+            return original_image, None, "❌ Invalid image"
+        # Step 1: Auto-rotate to correct orientation
+        print("🔄 Auto-rotating image...")
+        rotated_image = auto_rotate_image(image)
+        # Step 2: Detect document edges
+        print("📐 Detecting document edges...")
+        edges = detect_document_edges(rotated_image)
+        # Step 3: Apply perspective transformation
+        print("✂️ Applying perspective correction...")
+        scanned = four_point_transform(rotated_image, edges)
+        # Step 4: Enhance the scanned document
+        print("✨ Enhancing document...")
+        enhanced = enhance_document(scanned)
+        # OpenCV writes images in BGR channel order.
+        enhanced_bgr = cv2.cvtColor(enhanced, cv2.COLOR_RGB2BGR)
+        # NamedTemporaryFile creates the file itself; tempfile.mktemp is
+        # deprecated because another process can claim the name first.
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+            temp_path = tmp_file.name
+        # imwrite can signal failure by returning False instead of raising.
+        if not cv2.imwrite(temp_path, enhanced_bgr):
+            raise OSError(f"could not write scanned image to {temp_path}")
+        return enhanced, temp_path, "✅ Document scanned successfully!"
+    except Exception as e:
+        print(f"Error in scan_document: {e}")
+        return original_image, None, f"❌ Error: {str(e)}"
+# Custom CSS handed to gr.Blocks(css=...) below. The #image_upload and
+# #output_image selectors match the elem_id values of the two gr.Image
+# components; .gradio-container caps and centres the page width.
+# NOTE(review): .primary presumably targets the variant="primary" button --
+# confirm against the class names the installed Gradio version emits.
+custom_css = """
+#image_upload {
+    max-height: 400px !important;
+}
+.gradio-container {
+    max-width: 1200px !important;
+    margin: auto !important;
+}
+#output_image {
+    max-height: 500px !important;
+}
+.primary {
+    background: linear-gradient(45deg, #4CAF50, #45a049) !important;
+    border: none !important;
+}
+.primary:hover {
+    background: linear-gradient(45deg, #45a049, #4CAF50) !important;
+    transform: translateY(-2px) !important;
+}
+"""
+# Create Gradio interface: a header, a two-column row (upload + controls on
+# the left, scanned result + download on the right), and a static features
+# panel underneath.
+with gr.Blocks(css=custom_css, title="📄 AI Document Scanner", theme=gr.themes.Soft()) as demo:
+    gr.HTML("""
+    <div style="text-align: center; margin-bottom: 30px;">
+        <h1 style="color: #2E7D32; font-size: 2.5em; margin-bottom: 10px;">📄 AI Document Scanner</h1>
+        <p style="color: #666; font-size: 1.2em;">Professional document scanning with automatic perspective correction, rotation, and enhancement</p>
+    </div>
+    """)
+    with gr.Row():
+        # Left column: image input, scan button, and read-only status box.
+        with gr.Column(scale=1):
+            gr.HTML("<h3 style='color: #1976D2; text-align: center;'>📤 Upload Document</h3>")
+            input_image = gr.Image(
+                type="pil",
+                label="Upload your document photo",
+                elem_id="image_upload",
+                height=400,
+                sources=["upload", "webcam"]
+            )
+            scan_btn = gr.Button(
+                "🔍 Scan Document",
+                variant="primary",
+                size="lg"
+            )
+            status_text = gr.Textbox(
+                label="📊 Status",
+                value="Ready to scan documents",
+                interactive=False,
+                lines=2
+            )
+        # Right column: the processed image and a downloadable file.
+        with gr.Column(scale=1):
+            gr.HTML("<h3 style='color: #1976D2; text-align: center;'>📋 Scanned Result</h3>")
+            output_image = gr.Image(
+                type="numpy",
+                label="Scanned Document",
+                elem_id="output_image",
+                height=400
+            )
+            download_file = gr.File(
+                label="📥 Download Scanned Document"
+            )
+    # Features section (static HTML only; not connected to any callback)
+    gr.HTML("""
+    <div style="margin-top: 30px; padding: 20px; background: linear-gradient(135deg, #E8F5E8, #F0F8FF); border-radius: 15px;">
+        <h3 style="color: #2E7D32; text-align: center; margin-bottom: 15px;">✨ Key Features</h3>
+        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px;">
+            <div style="text-align: center;">
+                <span style="font-size: 2em;">🔄</span>
+                <p><strong>Auto Rotation</strong><br>Automatically detects and corrects orientation</p>
+            </div>
+            <div style="text-align: center;">
+                <span style="font-size: 2em;">📐</span>
+                <p><strong>Perspective Correction</strong><br>Straightens tilted and skewed documents</p>
+            </div>
+            <div style="text-align: center;">
+                <span style="font-size: 2em;">✂️</span>
+                <p><strong>Smart Cropping</strong><br>Automatically crops to document boundaries</p>
+            </div>
+            <div style="text-align: center;">
+                <span style="font-size: 2em;">✨</span>
+                <p><strong>Enhancement</strong><br>Improves contrast and readability</p>
+            </div>
+        </div>
+    </div>
+    """)
+    # Set up the scanning function. The outputs list follows the order of
+    # scan_document's return tuple: image, file path, status message.
+    scan_btn.click(
+        fn=scan_document,
+        inputs=[input_image],
+        outputs=[output_image, download_file, status_text]
+    )
+# Launch the app only when run as a script, not when imported.
 if __name__ == "__main__":
+    demo.launch()