sbapan41 committed on
Commit
2df383f
·
verified ·
1 Parent(s): 5e676ca

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +367 -0
  2. curl.txt +2 -2
  3. requirements.txt +18 -0
app.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import json
4
+ import logging
5
+ import time
6
+ import subprocess
7
+ from flask import Flask, request, jsonify
8
+ from werkzeug.utils import secure_filename
9
+ import pdfplumber
10
+ from pdf2image import convert_from_path
11
+ from PIL import Image
12
+ import cv2
13
+ import numpy as np
14
+ import io
15
+ import pandas as pd
16
+ try:
17
+ from docx import Document
18
+ except ImportError:
19
+ Document = None # Handle case where python-docx is not installed
20
+ import openpyxl
21
+ import easyocr
22
+
23
# Flask application object; routes are registered on it further down.
app = Flask(__name__)

# Process-wide logging: INFO and above, timestamped, tagged with logger name.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Upload handling. Only files whose extension is listed here are accepted.
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'txt', 'csv', 'xlsx', 'xls', 'jpg', 'jpeg', 'png'}
UPLOAD_FOLDER = tempfile.mkdtemp()
OUTPUT_FOLDER = os.path.join(os.getcwd(), 'extracted_data')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=16 * 1024 * 1024,  # 16MB limit
)

# Accepted API keys mapped to a client label.
# NOTE(review): these look like placeholders -- real keys should presumably
# come from configuration or a secret store rather than source code.
API_KEYS = {
    "your_api_key_1": "client1",
    "your_api_key_2": "client2",
}

# One EasyOCR reader per language pair (English plus 'hi', 'bn' or 'ur'),
# each requested with GPU support. They are built once at import time and
# shared by every request.
reader_en_hi = easyocr.Reader(['en', 'hi'], gpu=True)
reader_en_bn = easyocr.Reader(['en', 'bn'], gpu=True)
reader_en_ur = easyocr.Reader(['en', 'ur'], gpu=True)
50
+
51
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared case-insensitively.
    A name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
53
+
54
def validate_api_key():
    """Report whether the current request carries a known API key.

    Reads the ``X-API-KEY`` request header and returns True only when it is
    present, non-empty and one of the keys in API_KEYS.
    """
    supplied_key = request.headers.get('X-API-KEY')
    return bool(supplied_key) and supplied_key in API_KEYS
60
+
61
def preprocess_image(image):
    """Binarize *image* with adaptive thresholding ahead of OCR.

    The input is converted to an array, normalized to RGB (from grayscale or
    RGBA when needed), reduced to grayscale and thresholded. The result is
    returned as a PIL image.

    This is best-effort: if any step raises, the error is logged and the
    original *image* is returned unchanged.
    """
    try:
        pixels = np.array(image)

        # Bring every supported layout to 3-channel RGB first.
        if pixels.ndim == 2:  # Grayscale
            pixels = cv2.cvtColor(pixels, cv2.COLOR_GRAY2RGB)
        elif pixels.shape[2] == 4:  # RGBA
            pixels = cv2.cvtColor(pixels, cv2.COLOR_RGBA2RGB)

        grayscale = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

        # Gaussian adaptive threshold: block size 11, constant 2, output 0/255.
        binarized = cv2.adaptiveThreshold(
            grayscale,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,
            2,
        )

        return Image.fromarray(binarized)
    except Exception as e:
        logger.error(f"Image preprocessing failed: {str(e)}")
        return image
84
+
85
def extract_text_from_image(image):
    """Extract text from *image* by running all three EasyOCR readers.

    The image is preprocessed once and passed to the en+hi, en+bn and en+ur
    readers in that order. Each reader's detected strings are joined with
    spaces, and the three results are joined with spaces again.

    Returns:
        The combined text, or "" if preprocessing or any reader raises (the
        failure is logged with its traceback).
    """
    try:
        # Convert to an array once; every reader gets the same pixel data.
        pixels = np.array(preprocess_image(image))

        per_reader_text = []
        for reader in (reader_en_hi, reader_en_bn, reader_en_ur):
            detections = reader.readtext(pixels)
            # Each detection's second element is the recognized string.
            per_reader_text.append(" ".join(detection[1] for detection in detections))

        return " ".join(per_reader_text)
    except Exception as e:
        logger.exception(f"OCR extraction failed: {str(e)}")
        return ""
101
+
102
def process_pdf_page(page, page_num, pdf_path):
    """Extract native and (when warranted) OCR text from one PDF page.

    Args:
        page: Page object exposing ``extract_text`` and ``images``.
        page_num: Zero-based page index; reported one-based in the result.
        pdf_path: Path of the PDF, used to rasterize the page for OCR.

    Returns:
        A dict with keys ``page``, ``native_text``, ``image_text`` and
        ``type``. ``type`` starts as "mixed" and becomes "ocr_text" only when
        OCR text is used and no native text was found.
    """
    one_based_page = page_num + 1
    result = {
        "page": one_based_page,
        "native_text": "",
        "image_text": "",
        "type": "mixed",
    }

    # Native text layer first; a failure here just leaves it empty.
    try:
        result["native_text"] = page.extract_text(x_tolerance=1, y_tolerance=1) or ""
    except Exception as e:
        logger.warning(f"Native text extraction failed: {str(e)}")

    # OCR only if the page has images or native text is under 50 characters.
    needs_ocr = page.images or len(result["native_text"].strip()) < 50
    if not needs_ocr:
        return result

    try:
        # Rasterize just this page.
        # NOTE(review): the fixed A4-at-300dpi size presumably distorts pages
        # with a different aspect ratio -- confirm against pdf2image docs.
        images = convert_from_path(
            pdf_path,
            first_page=one_based_page,
            last_page=one_based_page,
            dpi=300,
            size=(2480, 3508),  # A4 size at 300dpi
        )

        if images:
            full_page_text = extract_text_from_image(images[0])

            # Keep the OCR text only when it is longer than the native text.
            if len(full_page_text) > len(result["native_text"]):
                result["image_text"] = full_page_text
                result["type"] = "mixed" if result["native_text"] else "ocr_text"

        # Explicit cleanup
        del images
    except Exception as e:
        logger.error(f"Page image processing failed: {str(e)}")

    return result
143
+
144
def process_docx(file_path):
    """Extract paragraph text from a DOCX file.

    Paragraph texts are joined with newlines and reported as a single page of
    type "native_text". Only ``doc.paragraphs`` is read.

    Raises:
        ImportError: if python-docx could not be imported at module load.
        Exception: any error from opening or reading the document is logged
            and re-raised.
    """
    if Document is None:
        raise ImportError("python-docx package is not installed")

    try:
        paragraphs = Document(file_path).paragraphs
        text = "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        logger.error(f"DOCX processing failed: {str(e)}")
        raise

    page_entry = {"page": 1, "text": text, "type": "native_text"}
    return {"content": [page_entry]}
162
+
163
def process_txt(file_path):
    """Extract text from a UTF-8 encoded TXT file.

    The file is decoded with ``utf-8-sig``, so a leading byte-order mark is
    dropped instead of ending up in the extracted text. Files without a BOM
    decode exactly as plain UTF-8.

    Returns:
        ``{"content": [{"page": 1, "text": <file text>, "type": "native_text"}]}``

    Raises:
        Exception: any error from opening or decoding the file (for example
            ``UnicodeDecodeError`` for non-UTF-8 input) is logged and re-raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            text = f.read()
    except Exception as e:
        logger.error(f"TXT processing failed: {str(e)}")
        raise

    return {
        "content": [{
            "page": 1,
            "text": text,
            "type": "native_text"
        }]
    }
178
+
179
def process_csv(file_path):
    """Extract the contents of a CSV file as a text table.

    Cells are read as strings with NA detection disabled, so the extracted
    text keeps what the file contains: leading zeros survive ("007" stays
    "007") and blank cells stay blank instead of becoming "NaN".

    Returns:
        ``{"content": [{"page": 1, "text": <table text>, "type": "table_data"}]}``
        where the text is ``DataFrame.to_string(index=False)``.

    Raises:
        Exception: any error from pandas while reading or rendering is
            logged and re-raised.
    """
    try:
        # dtype=str + keep_default_na=False: no numeric or NaN coercion.
        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
        text = df.to_string(index=False)
    except Exception as e:
        logger.error(f"CSV processing failed: {str(e)}")
        raise

    return {
        "content": [{
            "page": 1,
            "text": text,
            "type": "table_data"
        }]
    }
194
+
195
def process_excel(file_path):
    """Extract data from an Excel file (XLSX or XLS) as tab/newline text.

    ``.xlsx`` files (matched case-insensitively) are read with openpyxl: each
    worksheet contributes a "Sheet: <name>" header followed by one line per
    row, cells joined with tabs and empty cells rendered as "". Chartsheets
    are skipped because they have no rows to iterate.

    Any other path is read with ``pandas.read_excel`` across all sheets and
    rendered with ``DataFrame.to_string(index=False)``.
    NOTE(review): reading legacy .xls through pandas presumably needs an
    extra engine such as xlrd -- confirm it is installed.

    Returns:
        ``{"content": [{"page": 1, "text": <text>, "type": "table_data"}]}``

    Raises:
        Exception: any error while reading the workbook is logged and
            re-raised.
    """
    try:
        sections = []
        # Case-insensitive so "REPORT.XLSX" takes the same path as "report.xlsx".
        if file_path.lower().endswith('.xlsx'):
            wb = openpyxl.load_workbook(file_path)
            # wb.worksheets excludes chartsheets, which lack iter_rows().
            for sheet in wb.worksheets:
                row_lines = [
                    "\t".join("" if cell is None else str(cell) for cell in row) + "\n"
                    for row in sheet.iter_rows(values_only=True)
                ]
                sections.append(f"\n\nSheet: {sheet.title}\n" + "".join(row_lines))
        else:  # .xls
            sheets = pd.read_excel(file_path, sheet_name=None)
            for sheet_name, data in sheets.items():
                sections.append(f"\n\nSheet: {sheet_name}\n{data.to_string(index=False)}\n")

        text = "".join(sections)
    except Exception as e:
        logger.error(f"Excel processing failed: {str(e)}")
        raise

    return {
        "content": [{
            "page": 1,
            "text": text,
            "type": "table_data"
        }]
    }
221
+
222
def process_image(file_path):
    """Extract text from an image file (JPG, JPEG, PNG) via OCR.

    The image is opened in a ``with`` block so its file handle is released
    as soon as OCR finishes, before the caller deletes the temporary file.

    Returns:
        ``{"content": [{"page": 1, "text": <ocr text>, "type": "ocr_text"}]}``

    Raises:
        Exception: any error while opening the image is logged and re-raised.
    """
    try:
        with Image.open(file_path) as image:
            text = extract_text_from_image(image)
    except Exception as e:
        logger.error(f"Image processing failed: {str(e)}")
        raise

    return {
        "content": [{
            "page": 1,
            "text": text,
            "type": "ocr_text"
        }]
    }
237
+
238
@app.route('/')
def home():
    """Serve the root URL with a plain-text welcome message."""
    welcome_message = "Welcome to Quantumhash OCR_Data_Extraction"
    return welcome_message
241
+
242
def _build_metadata(filename, pages, started_at, text_length):
    """Assemble the ``metadata`` section shared by every /process response."""
    return {
        "filename": filename,
        "pages": pages,
        "processing_time": round(time.time() - started_at, 2),
        "text_length": text_length,
    }


def _process_pdf(pdf_path):
    """Process every page of a PDF; return (page results, combined text length)."""
    with pdfplumber.open(pdf_path) as pdf:
        results = [
            process_pdf_page(page, page_num, pdf_path)
            for page_num, page in enumerate(pdf.pages)
        ]

    # Length counts native and OCR text per page, each followed by a newline.
    combined_text = "".join(
        page.get("native_text", "") + "\n" + page.get("image_text", "") + "\n"
        for page in results
    )
    return results, len(combined_text)


@app.route('/process', methods=['POST'])
def handle_file():
    """Handle an uploaded file and return its extracted text as JSON.

    Expects a multipart POST with a ``file`` part and an ``X-API-KEY`` header.

    Returns:
        * 401 when the API key is missing or unknown.
        * 400 when no file is sent, the filename is empty, or the extension
          is not allowed.
        * 200 with ``{"metadata": {...}, "content": [...]}`` on success.
        * 500 with ``{"error": <message>}`` when processing raises.

    The upload is stored in a per-request temporary directory that is always
    removed afterwards.
    """
    import shutil  # local import: only needed for temp-dir cleanup

    # API Key validation
    if not validate_api_key():
        return jsonify({"error": "Invalid or missing API key"}), 401

    if 'file' not in request.files:
        return jsonify({"error": "No file provided"}), 400

    upload = request.files['file']
    if not upload or upload.filename == '':
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(upload.filename):
        return jsonify({"error": "Invalid file type"}), 400

    # Take the extension from the name allowed_file() just validated.
    # secure_filename() can strip everything before the dot (e.g. for a
    # non-ASCII stem), so the sanitized name is not a reliable source.
    file_extension = upload.filename.rsplit('.', 1)[1].lower()

    # Sanitized name is only reported back in metadata.
    filename = secure_filename(upload.filename)
    if '.' not in filename:
        filename = f"upload.{file_extension}"

    temp_dir = None
    try:
        # Store under a fixed name with the normalized extension, so the
        # on-disk path never depends on client-supplied text.
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, f"upload.{file_extension}")
        upload.save(temp_path)

        start_time = time.time()

        if file_extension == 'pdf':
            results, text_length = _process_pdf(temp_path)
            response = {
                "metadata": _build_metadata(filename, len(results), start_time, text_length),
                "content": results,
            }
        else:
            # Single-page formats: pick the extractor by extension.
            processors = {
                'docx': process_docx,
                'txt': process_txt,
                'csv': process_csv,
                'xlsx': process_excel,
                'xls': process_excel,
                'jpg': process_image,
                'jpeg': process_image,
                'png': process_image,
            }
            processor = processors.get(file_extension)
            if processor is None:
                return jsonify({"error": "Unsupported file type"}), 400

            response = processor(temp_path)
            response['metadata'] = _build_metadata(
                filename, 1, start_time, len(response['content'][0]['text'])
            )

        return jsonify(response)

    except Exception as e:
        logger.exception(f"Processing failed: {str(e)}")
        return jsonify({"error": str(e)}), 500

    finally:
        # Remove the whole temp directory, including any stray files.
        if temp_dir is not None:
            try:
                shutil.rmtree(temp_dir)
            except OSError as e:
                logger.error(f"Cleanup failed: {str(e)}")
349
+
350
def start_ngrok(subdomain="clean-guided-gar", port=5000):
    """Launch an ngrok HTTP tunnel to the local server in a child process.

    Args:
        subdomain: Label under ``ngrok-free.app`` to request as the domain.
        port: Local port the tunnel forwards to.

    No auth token is embedded here: the previous hard-coded token was never
    passed to ngrok, and a credential committed to source should be rotated.
    NOTE(review): the ngrok agent presumably picks the token up from its own
    config file or the NGROK_AUTHTOKEN environment variable -- confirm.

    Start-up is best-effort: if the ngrok executable cannot be launched the
    error is logged and the function returns without raising.
    """
    # Argument list (no shell), so nothing is interpreted by a shell.
    ngrok_cmd = [
        "ngrok",
        "http",
        f"--domain={subdomain}.ngrok-free.app",
        str(port),
    ]

    try:
        # Start Ngrok in a separate process
        subprocess.Popen(ngrok_cmd)
    except OSError as e:
        logger.error(f"Failed to start ngrok: {e}")
        return

    print(f"\n * Ngrok tunnel running at: https://{subdomain}.ngrok-free.app")
361
+
362
if __name__ == '__main__':
    # Script entry point: bring up the ngrok tunnel first, then block in
    # Flask's built-in server, listening on all interfaces on port 5000.
    start_ngrok()
    app.run(port=5000, host='0.0.0.0')
curl.txt CHANGED
@@ -1,4 +1,4 @@
1
  curl -X POST \
2
- http://localhost:5000/process \
3
  -H 'X-API-KEY: your_api_key_1' \
4
- -F 'file=@/path/to/your/file.pdf'
 
1
  curl -X POST \
2
+ https://clean-guided-gar.ngrok-free.app/process \
3
  -H 'X-API-KEY: your_api_key_1' \
4
+ -F 'file=@Report.pdf'
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask==2.2.5
2
+ flask-httpauth==4.7.0
3
+ pdfplumber==0.10.3
4
+ pdf2image==1.16.3
5
+ Pillow==9.5.0
6
+ opencv-python-headless==4.8.0.76
7
+ numpy==1.24.4
8
+ pandas==1.5.3
9
+ python-docx==0.8.11
10
+ openpyxl==3.1.2
11
+ easyocr==1.7.1
12
+ torch==2.0.1
13
+ torchvision==0.15.2
14
+ scikit-image==0.21.0
15
+ matplotlib==3.7.1
16
+ tqdm==4.65.0
17
+ PyMuPDF==1.22.0
18
+ pyngrok