Knightmovies committed on
Commit
65c6a60
·
verified ·
1 Parent(s): 96d6384

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -138
app.py CHANGED
@@ -11,15 +11,9 @@ from scipy.spatial import distance as dist
11
  # App Configuration & Model Loading
12
  # ==============================================================================
13
 
14
- # For Hugging Face Spaces deployment, you also need these two files:
15
- # 1. requirements.txt (listing all Python libraries)
16
- # 2. packages.txt (containing the line "tesseract-ocr")
17
- # NOTE: With this new code, you can remove 'matplotlib' from requirements.txt
18
-
19
- # Set Streamlit page configuration
20
# Configure the Streamlit page; must run before any other st.* call.
_PAGE_CONFIG = {
    "page_title": "Document Scanner & Table Recognizer",
    "page_icon": "📄",
    "layout": "wide",
}
st.set_page_config(**_PAGE_CONFIG)
25
 
@@ -27,16 +21,15 @@ st.set_page_config(
27
@st.cache_resource
def load_model():
    """Load and cache the Table Transformer processor/model pair.

    Returns:
        tuple: (DetrImageProcessor, TableTransformerForObjectDetection),
        both loaded from the same structure-recognition checkpoint.
    """
    checkpoint = "microsoft/table-transformer-structure-recognition"
    print("Loading model...")
    loaded_processor = DetrImageProcessor.from_pretrained(checkpoint)
    loaded_model = TableTransformerForObjectDetection.from_pretrained(checkpoint)
    print("Model loaded successfully.")
    return loaded_processor, loaded_model

processor, model = load_model()
37
 
38
  # ==============================================================================
39
- # Core Image Processing Functions
40
  # ==============================================================================
41
 
42
  def order_points(pts):
@@ -75,157 +68,122 @@ def find_and_straighten_document(image):
75
  return perspective_transform(image, box)
76
 
77
def correct_orientation(image):
    """
    Corrects the orientation of an image using a robust cascade method.
    1. Tries fast OSD (Orientation and Script Detection).
    2. If OSD fails, falls back to analyzing word bounding boxes.

    Args:
        image: BGR document image (numpy array, as from cv2.imdecode).

    Returns:
        The image rotated upright (BGR); returned unchanged when no
        rotation is detected.
    """
    print("--- Running Orientation Check ---")

    # --- METHOD 1: Fast Orientation and Script Detection (OSD) ---
    try:
        # timeout keeps a hung Tesseract process from blocking the app
        osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT, timeout=5)
        rotation = osd['rotate']
        print(f"OSD check successful. Detected rotation: {rotation} degrees.")
        if rotation > 0:
            # Tesseract's rotation is counter-clockwise
            if rotation == 90:
                return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
            elif rotation == 180:
                return cv2.rotate(image, cv2.ROTATE_180)
            else: # 270
                return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
        return image # Return as is if rotation is 0
    except Exception as e:
        print(f"OSD check failed: {e}. Falling back to word box analysis.")

    # --- METHOD 2: Fallback using Word Bounding Box Analysis ---
    # This method is slower but more robust for images with little text.
    # Idea: the correct orientation is the one in which Tesseract reports
    # the most wider-than-tall ("horizontal") word boxes.
    best_rotation = 0
    max_horizontal_boxes = -1

    # Pre-process image once for all rotations
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Candidate orientations, keyed by the CLOCKWISE angle applied to thresh.
    orientations = {
        0: thresh,
        90: cv2.rotate(thresh, cv2.ROTATE_90_CLOCKWISE),
        180: cv2.rotate(thresh, cv2.ROTATE_180),
        270: cv2.rotate(thresh, cv2.ROTATE_90_COUNTERCLOCKWISE)
    }

    for angle, rotated_img in orientations.items():
        try:
            data = pytesseract.image_to_data(rotated_img, output_type=pytesseract.Output.DICT, timeout=5)
            horizontal_boxes = 0
            num_boxes = len(data['level'])
            for i in range(num_boxes):
                # We only consider word-level boxes (level 5) with some confidence
                if data['level'][i] == 5 and int(data['conf'][i]) > 10:
                    w = data['width'][i]
                    h = data['height'][i]
                    if w > h: # Check if the box is horizontal
                        horizontal_boxes += 1

            print(f" Rotation {angle}°: Found {horizontal_boxes} horizontal word boxes.")

            if horizontal_boxes > max_horizontal_boxes:
                max_horizontal_boxes = horizontal_boxes
                best_rotation = angle
        except Exception as e:
            # A failed OCR pass for one angle should not abort the others.
            print(f" Word box analysis failed for rotation {angle}°: {e}")
            continue

    print(f"--> Best rotation found at {best_rotation} degrees.")

    # Apply the best rotation to the ORIGINAL color image.
    # NOTE: candidates above were rotated clockwise, so the winning angle is
    # applied clockwise here too (opposite sense from METHOD 1's undo).
    if best_rotation == 90:
        return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    elif best_rotation == 180:
        return cv2.rotate(image, cv2.ROTATE_180)
    elif best_rotation == 270:
        return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    else: # 0 degrees
        return image
151
 
152
- # ==============================================================================
153
- # NEW AND IMPROVED: Table Structure Recognition using OpenCV for Drawing
154
- # ==============================================================================
155
def extract_and_draw_table_structure(image_bgr):
    """
    Takes a BGR image, finds table structure, and returns an image with
    bounding boxes drawn directly using OpenCV.
    """
    # Run the Table Transformer over an RGB copy of the input frame.
    rgb_frame = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    pil_frame = Image.fromarray(rgb_frame)
    model_inputs = processor(images=pil_frame, return_tensors="pt")
    with torch.no_grad():
        predictions = model(**model_inputs)

    # Rescale boxes to pixel coordinates; PIL size is (w, h), model wants (h, w).
    sizes = torch.tensor([pil_frame.size[::-1]])
    detections = processor.post_process_object_detection(predictions, threshold=0.7, target_sizes=sizes)[0]

    annotated = image_bgr.copy()
    # BGR palette: rows green, columns red, the table outline magenta.
    colors = {"table row": (0, 255, 0), "table column": (0, 0, 255), "table": (255, 0, 255)}

    for score, label, box in zip(detections["scores"], detections["labels"], detections["boxes"]):
        class_name = model.config.id2label[label.item()]
        if class_name not in colors:
            continue
        xmin, ymin, xmax, ymax = map(int, box.tolist())
        cv2.rectangle(annotated, (xmin, ymin), (xmax, ymax), colors[class_name], 2)

    return annotated
189
 
190
# ==============================================================================
# Streamlit UI (Unchanged)
# ==============================================================================

st.title("📄 Document Scanner & Table Recognizer")
st.write("Upload a document photo. The app will automatically straighten it, fix the orientation, and detect the table structure using a Transformer model.")

uploaded_file = st.file_uploader("Choose a document image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Decode the uploaded bytes into an OpenCV BGR image.
    file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
    input_image = cv2.imdecode(file_bytes, 1) # 1 = cv2.IMREAD_COLOR (3-channel BGR)

    st.subheader("1. Original Image")
    st.image(cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB), caption="Your Uploaded Image", use_container_width=True)

    with st.spinner("Processing your document... This may take a moment."):
        straightened_image = find_and_straighten_document(input_image)
        # Fall back to the raw upload if straightening produced nothing usable.
        image_to_process = straightened_image if straightened_image is not None and straightened_image.size > 0 else input_image
        final_image = correct_orientation(image_to_process)

        # This now returns a BGR image from OpenCV
        image_with_structure_bgr = extract_and_draw_table_structure(final_image)

    st.subheader("2. Corrected Document & Detected Structure")
    col1, col2 = st.columns(2)

    with col1:
        final_image_rgb = cv2.cvtColor(final_image, cv2.COLOR_BGR2RGB)
        st.image(final_image_rgb, caption="Auto-Corrected & Oriented", use_container_width=True)

        _, buf = cv2.imencode(".jpg", final_image) # Use the BGR image for encoding
        st.download_button(
            label="Download Clean Image",
            data=buf.tobytes(),
            file_name="corrected_document.jpg",
            mime="image/jpeg",
        )

    with col2:
        image_with_structure_rgb = cv2.cvtColor(image_with_structure_bgr, cv2.COLOR_BGR2RGB)
        st.image(image_with_structure_rgb, caption="Detected Table Structure (Rows: Green, Columns: Red)", use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  # App Configuration & Model Loading
12
  # ==============================================================================
13
 
 
 
 
 
 
 
14
# Page chrome; st.set_page_config must be the first Streamlit call on the page.
st.set_page_config(layout="wide", page_title="Document AI Toolkit", page_icon="🤖")
19
 
 
21
@st.cache_resource
def load_model():
    """Load the Table Transformer structure-recognition processor and model.

    Cached by Streamlit, so the heavy download/initialization (and the
    "cache miss" message) happens only on a cold start.
    """
    st.write("Cache miss: Loading Table Transformer model...")
    checkpoint = "microsoft/table-transformer-structure-recognition"
    return (
        DetrImageProcessor.from_pretrained(checkpoint),
        TableTransformerForObjectDetection.from_pretrained(checkpoint),
    )

processor, model = load_model()
30
 
31
  # ==============================================================================
32
+ # Core Image Processing Functions (Unchanged)
33
  # ==============================================================================
34
 
35
  def order_points(pts):
 
68
  return perspective_transform(image, box)
69
 
70
def correct_orientation(image):
    """Correct page orientation using Tesseract OSD.

    Args:
        image: BGR document image (numpy array, as from cv2.imdecode).

    Returns:
        The image rotated upright (BGR). The original image is returned when
        OSD reports no rotation or when the OSD call fails.
    """
    try:
        osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)
        rotation = osd['rotate']
        if rotation in (90, 180, 270):
            # Tesseract's reported rotation is undone by rotating the
            # opposite way (its convention is counter-clockwise).
            if rotation == 90:
                return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
            if rotation == 180:
                return cv2.rotate(image, cv2.ROTATE_180)
            return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)  # 270
        # BUG FIX: previously this path (rotation == 0) fell off the end of
        # the try block and the function implicitly returned None, crashing
        # downstream cv2.cvtColor calls. Return the image unchanged instead.
        return image
    except Exception as e:
        st.warning(f"OSD check failed: {e}. Using original orientation.")
        return image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
 
 
 
86
def extract_and_draw_table_structure(image_bgr):
    """Detect table structure with Table Transformer and draw it.

    Args:
        image_bgr: Document image in OpenCV BGR channel order.

    Returns:
        A copy of ``image_bgr`` (BGR) with detected rectangles drawn:
        rows green, columns red, table outline magenta.
    """
    image_pil = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    inputs = processor(images=image_pil, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-processing expects (height, width); PIL reports (width, height).
    target_sizes = torch.tensor([image_pil.size[::-1]])
    results = processor.post_process_object_detection(outputs, threshold=0.7, target_sizes=target_sizes)[0]

    colors = {"table row": (0, 255, 0), "table column": (0, 0, 255), "table": (255, 0, 255)}
    img_with_boxes = image_bgr.copy()

    for label_tensor, box_tensor in zip(results["labels"], results["boxes"]):
        name = model.config.id2label[label_tensor.item()]
        color = colors.get(name)
        if color is None:
            continue
        x0, y0, x1, y1 = (int(v) for v in box_tensor.tolist())
        cv2.rectangle(img_with_boxes, (x0, y0), (x1, y1), color, 2)

    return img_with_boxes
102
 
103
  # ==============================================================================
104
+ # UI Functions for Each Step
105
  # ==============================================================================
106
 
107
def initialize_state():
    """Seed st.session_state with the app's defaults (first run only)."""
    if "stage" in st.session_state:
        return  # already initialized for this session
    st.session_state.stage = "upload"
    st.session_state.original_image = None
    st.session_state.processed_image = None
113
+
114
def reset_app():
    """Reset the app to the initial upload stage.

    Clears every entry from st.session_state, then re-seeds the defaults
    via initialize_state().
    """
    # BUG FIX: snapshot the keys before deleting. Deleting entries while
    # iterating the live .keys() view raises "dictionary changed size
    # during iteration".
    for key in list(st.session_state.keys()):
        del st.session_state[key]
    initialize_state()
119
+
120
# --- Main App UI ---
# Three-stage wizard driven by st.session_state.stage:
# "upload" -> "process" -> "analyze". Each button advances the stage and
# calls st.rerun() so the script re-executes with the new stage value.
initialize_state()

st.title("🤖 Document AI Toolkit")
st.markdown("---")

# Use columns for a centered and constrained layout
left_col, main_col, right_col = st.columns([1, 4, 1])

with main_col:
    # --- STAGE 1: UPLOAD ---
    if st.session_state.stage == "upload":
        st.header("Step 1: Upload Your Document")
        uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

        if uploaded_file:
            # Decode the upload into an OpenCV BGR image and persist it
            # across reruns in session state.
            file_bytes = np.asarray(bytearray(uploaded_file.read()), dtype=np.uint8)
            st.session_state.original_image = cv2.imdecode(file_bytes, 1)
            st.image(cv2.cvtColor(st.session_state.original_image, cv2.COLOR_BGR2RGB), caption="Original Upload", use_container_width=True)

            if st.button("▶️ Start Pre-processing"):
                st.session_state.stage = "process"
                st.rerun()

    # --- STAGE 2: PRE-PROCESSING ---
    elif st.session_state.stage == "process":
        st.header("Step 2: Pre-processing Result")
        with st.spinner("Straightening and correcting orientation..."):
            original_image = st.session_state.original_image
            straightened = find_and_straighten_document(original_image)
            # Fall back to the raw upload if straightening produced nothing usable.
            image_to_orient = straightened if straightened is not None and straightened.size > 0 else original_image
            st.session_state.processed_image = correct_orientation(image_to_orient)

        st.image(cv2.cvtColor(st.session_state.processed_image, cv2.COLOR_BGR2RGB), caption="Corrected Document", use_container_width=True)
        st.info("The document has been straightened and oriented.")

        if st.button("📊 Find Table Structure"):
            st.session_state.stage = "analyze"
            st.rerun()

        if st.button("↩️ Upload New Image"):
            reset_app()
            st.rerun()

    # --- STAGE 3: ANALYSIS ---
    elif st.session_state.stage == "analyze":
        st.header("Step 3: Table Structure Analysis")
        processed_image = st.session_state.processed_image
        with st.spinner("Running Table Transformer model... This can take a moment."):
            annotated_image = extract_and_draw_table_structure(processed_image)

        st.subheader("Final Results")

        # Display results side-by-side
        res_col1, res_col2 = st.columns(2)
        with res_col1:
            st.image(cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB), caption="Cleaned Document", use_container_width=True)
            # Encode the BGR image to JPEG bytes for the download button.
            _, buf = cv2.imencode(".jpg", processed_image)
            st.download_button(
                label="📥 Download Clean Image",
                data=buf.tobytes(),
                file_name="corrected_document.jpg",
                mime="image/jpeg",
            )
        with res_col2:
            st.image(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB), caption="Detected Table Structure", use_container_width=True)

        if st.button("🔄 Start Over"):
            reset_app()
            st.rerun()