sbapan41 committed on
Commit
431e767
·
verified ·
1 Parent(s): b7554d3

Upload 4 files

Browse files
Text_extraction_deploy_2.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import json
4
+ import logging
5
+ import time
6
+ from flask import Flask, request, jsonify
7
+ from werkzeug.utils import secure_filename
8
+ import pdfplumber
9
+ from pdf2image import convert_from_path
10
+ from PIL import Image
11
+ import cv2
12
+ import numpy as np
13
+ import io
14
+ import pandas as pd
15
+ try:
16
+ from docx import Document
17
+ except ImportError:
18
+ Document = None # Handle case where python-docx is not installed
19
+ import openpyxl
20
+ import easyocr
21
+
22
app = Flask(__name__)

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'txt', 'csv', 'xlsx', 'xls', 'jpg', 'jpeg', 'png'}
UPLOAD_FOLDER = tempfile.mkdtemp()
OUTPUT_FOLDER = os.path.join(os.getcwd(), 'extracted_data')
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB limit


# API Key Configuration
def _load_api_keys():
    """Return the mapping of accepted API keys to client names.

    When the TEXT_EXTRACTION_API_KEYS environment variable is set it must hold
    a non-empty JSON object such as ``{"<api key>": "<client name>"}`` and that
    mapping is used.  Otherwise the placeholder keys shipped with this file are
    kept (so existing setups continue to work) and a warning is logged, because
    placeholder credentials must not be relied on in a real deployment.

    Raises:
        ValueError: if the environment variable is set but is not a non-empty
            JSON object.
    """
    raw = os.environ.get('TEXT_EXTRACTION_API_KEYS')
    if not raw:
        logger.warning(
            "TEXT_EXTRACTION_API_KEYS is not set; falling back to the "
            "placeholder API keys defined in source"
        )
        return {
            "your_api_key_1": "client1",
            "your_api_key_2": "client2"
        }

    try:
        keys = json.loads(raw)
    except json.JSONDecodeError as err:
        raise ValueError(
            "TEXT_EXTRACTION_API_KEYS must be a JSON object mapping API key to client name"
        ) from err
    if not isinstance(keys, dict) or not keys:
        raise ValueError(
            "TEXT_EXTRACTION_API_KEYS must be a non-empty JSON object mapping API key to client name"
        )
    return keys


API_KEYS = _load_api_keys()

# Initialize EasyOCR readers with GPU support.
# One reader per English + regional language pair; all three are run on every image.
reader_en_hi = easyocr.Reader(['en', 'hi'], gpu=True)
reader_en_bn = easyocr.Reader(['en', 'bn'], gpu=True)
reader_en_ur = easyocr.Reader(['en', 'ur'], gpu=True)
49
+
50
def allowed_file(filename):
    """Return True when *filename* ends in an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot and is compared
    case-insensitively.  A name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
52
+
53
def validate_api_key():
    """Report whether the current request carries a known API key.

    The key is read from the ``X-API-KEY`` request header and looked up in
    API_KEYS.  A missing or empty header counts as invalid.
    """
    supplied_key = request.headers.get('X-API-KEY')
    return bool(supplied_key) and supplied_key in API_KEYS
59
+
60
def preprocess_image(image):
    """Binarize *image* with adaptive thresholding ahead of OCR.

    The input is converted to a NumPy array, normalised to three-channel RGB
    (two-dimensional arrays are expanded, four-channel arrays lose their alpha
    channel), reduced to grayscale and thresholded.

    Returns:
        A PIL image holding the thresholded result, or the untouched *image*
        if any step fails (the failure is logged, not raised).
    """
    try:
        pixels = np.array(image)
        if pixels.ndim == 2:  # Grayscale
            pixels = cv2.cvtColor(pixels, cv2.COLOR_GRAY2RGB)
        elif pixels.shape[2] == 4:  # RGBA
            pixels = cv2.cvtColor(pixels, cv2.COLOR_RGBA2RGB)

        # Thresholding operates on a single channel.
        grayscale = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

        # Gaussian-weighted adaptive threshold: block size 11, constant 2.
        block_size, offset = 11, 2
        binarized = cv2.adaptiveThreshold(
            grayscale,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            block_size,
            offset,
        )
        return Image.fromarray(binarized)
    except Exception as e:
        # Best effort: OCR can still be attempted on the unprocessed image.
        logger.error(f"Image preprocessing failed: {str(e)}")
        return image
83
+
84
def extract_text_from_image(image):
    """Run every EasyOCR reader over *image* and return the concatenated text.

    The image is preprocessed once, then passed to the English+Hindi,
    English+Bengali and English+Urdu readers in that order.  Each reader's
    detected strings are joined with spaces, and the three per-reader strings
    are joined with spaces as well.

    Returns:
        The combined text, or an empty string if anything fails (the failure
        is logged, not raised).
    """
    try:
        ocr_input = preprocess_image(image)
        per_reader_text = []
        for reader in (reader_en_hi, reader_en_bn, reader_en_ur):
            detections = reader.readtext(np.array(ocr_input))
            # Index 1 of each detection is the text; the other fields are unused here.
            per_reader_text.append(" ".join(detection[1] for detection in detections))
        return " ".join(per_reader_text)
    except Exception as e:
        logger.error(f"OCR extraction failed: {str(e)}")
        return ""
100
+
101
def process_pdf_page(page, page_num, pdf_path):
    """Extract text from one PDF page, falling back to OCR when needed.

    Args:
        page: pdfplumber page object; its ``extract_text`` method and
            ``images`` attribute are used.
        page_num: Zero-based index of the page within the document.
        pdf_path: Path of the PDF on disk, used to rasterise the page for OCR.

    Returns:
        A dict with the keys ``page`` (one-based page number), ``native_text``
        (text embedded in the PDF), ``image_text`` (OCR text, kept only when it
        is longer than the native text) and ``type``: ``"mixed"`` when both
        texts are present, ``"ocr_text"`` when only OCR text is present and
        ``"native_text"`` otherwise.

    Failures of either extraction step are logged and leave the corresponding
    field empty; they are never raised to the caller.
    """
    result = {
        "page": page_num + 1,
        "native_text": "",
        "image_text": "",
        "type": "native_text"
    }

    # First try to extract native text
    try:
        result["native_text"] = page.extract_text(x_tolerance=1, y_tolerance=1) or ""
    except Exception as e:
        logger.warning(f"Native text extraction failed: {str(e)}")

    # Check if page has images or if native text extraction was insufficient
    if page.images or len(result["native_text"].strip()) < 50:
        try:
            # Rasterise the whole page 2480 px wide (A4 width at 300 dpi).
            # The height is left as None so the page keeps its own aspect
            # ratio; forcing a fixed height would stretch landscape or
            # non-A4 pages and degrade OCR.
            images = convert_from_path(
                pdf_path,
                first_page=page_num + 1,
                last_page=page_num + 1,
                dpi=300,
                size=(2480, None))

            if images:
                # Extract text from the full page image
                full_page_text = extract_text_from_image(images[0])

                # Only use OCR text if we got more content than native extraction
                if len(full_page_text) > len(result["native_text"]):
                    result["image_text"] = full_page_text

            # Explicit cleanup of the page bitmap
            del images
        except Exception as e:
            logger.error(f"Page image processing failed: {str(e)}")

    # Label the page by what was actually extracted.
    if result["image_text"]:
        result["type"] = "mixed" if result["native_text"] else "ocr_text"

    return result
142
+
143
def process_docx(file_path):
    """Read the paragraph text of a DOCX file.

    Returns:
        ``{"content": [...]}`` with a single page-1 entry of type
        ``"native_text"`` whose text is every paragraph joined by newlines.

    Raises:
        ImportError: if python-docx could not be imported.
        Exception: any error raised while reading the document is logged and
            re-raised unchanged.
    """
    if Document is None:
        raise ImportError("python-docx package is not installed")

    try:
        paragraphs = Document(file_path).paragraphs
        body = "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        logger.error(f"DOCX processing failed: {str(e)}")
        raise

    entry = {"page": 1, "text": body, "type": "native_text"}
    return {"content": [entry]}
161
+
162
def process_txt(file_path):
    """Read a plain-text file and return its content.

    The file is decoded as UTF-8.  A leading byte-order mark is dropped
    (``utf-8-sig``), and bytes that are not valid UTF-8 are replaced with
    U+FFFD instead of aborting the request, so files saved in a legacy
    encoding still yield their readable portion.

    Returns:
        ``{"content": [...]}`` with a single page-1 entry of type
        ``"native_text"``.

    Raises:
        Exception: errors such as a missing file are logged and re-raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
            text = f.read()
        return {
            "content": [{
                "page": 1,
                "text": text,
                "type": "native_text"
            }]
        }
    except Exception as e:
        logger.error(f"TXT processing failed: {str(e)}")
        raise
177
+
178
def process_csv(file_path):
    """Load a CSV file with pandas and render it as plain text.

    Returns:
        ``{"content": [...]}`` with a single page-1 entry of type
        ``"table_data"`` whose text is the DataFrame printed without its index.

    Raises:
        Exception: any error from reading or rendering is logged and re-raised.
    """
    try:
        table_text = pd.read_csv(file_path).to_string(index=False)
    except Exception as e:
        logger.error(f"CSV processing failed: {str(e)}")
        raise

    entry = {"page": 1, "text": table_text, "type": "table_data"}
    return {"content": [entry]}
193
+
194
def process_excel(file_path):
    """Extract cell data from an Excel workbook (XLSX or XLS) as text.

    Files whose name ends in ``.xlsx`` (compared case-insensitively, so
    ``REPORT.XLSX`` is handled the same way as ``report.xlsx``) are read with
    openpyxl: every worksheet contributes a ``Sheet: <title>`` header followed
    by one tab-separated line per row, with empty cells rendered as empty
    strings.  Chartsheets are skipped because they hold no cell rows.

    Any other file is read with ``pandas.read_excel`` and each sheet is
    rendered with ``DataFrame.to_string``.

    Returns:
        ``{"content": [...]}`` with a single page-1 entry of type
        ``"table_data"``.

    Raises:
        Exception: any error from reading the workbook is logged and re-raised.
    """
    try:
        # Collect fragments and join once instead of growing a string per row.
        parts = []
        if file_path.lower().endswith('.xlsx'):
            wb = openpyxl.load_workbook(file_path)
            # wb.worksheets excludes chartsheets, which have no iter_rows().
            for sheet in wb.worksheets:
                parts.append(f"\n\nSheet: {sheet.title}\n")
                for row in sheet.iter_rows(values_only=True):
                    cells = (str(cell) if cell is not None else "" for cell in row)
                    parts.append("\t".join(cells) + "\n")
        else:  # .xls
            # sheet_name=None loads every sheet as {sheet name: DataFrame}.
            sheets = pd.read_excel(file_path, sheet_name=None)
            for sheet_name, data in sheets.items():
                parts.append(f"\n\nSheet: {sheet_name}\n{data.to_string(index=False)}\n")

        text = "".join(parts)
        return {
            "content": [{
                "page": 1,
                "text": text,
                "type": "table_data"
            }]
        }
    except Exception as e:
        logger.error(f"Excel processing failed: {str(e)}")
        raise
220
+
221
def process_image(file_path):
    """Extract text from an image file (JPG, JPEG, PNG) using OCR.

    The image is opened in a ``with`` block so the underlying file handle is
    released as soon as OCR finishes; otherwise the handle would stay open
    until garbage collection and could block removal of the temporary file.

    Returns:
        ``{"content": [...]}`` with a single page-1 entry of type
        ``"ocr_text"``.

    Raises:
        Exception: errors while opening the image are logged and re-raised.
    """
    try:
        with Image.open(file_path) as image:
            # Pixel data is read while the file is still open.
            text = extract_text_from_image(image)
        return {
            "content": [{
                "page": 1,
                "text": text,
                "type": "ocr_text"
            }]
        }
    except Exception as e:
        logger.error(f"Image processing failed: {str(e)}")
        raise
236
+
237
def _process_pdf(pdf_path):
    """Run process_pdf_page over every page of the PDF and return the per-page results."""
    with pdfplumber.open(pdf_path) as pdf:
        return [
            process_pdf_page(page, page_num, pdf_path)
            for page_num, page in enumerate(pdf.pages)
        ]


@app.route('/process', methods=['POST'])
def handle_file():
    """Handle ``POST /process``: extract text from one uploaded file.

    The request must carry a valid ``X-API-KEY`` header and a multipart field
    named ``file``.  The upload is written to a private temporary directory,
    processed according to its extension and the directory is removed again
    before the response is sent.

    Returns:
        A JSON body with ``metadata`` (filename, pages, processing_time,
        text_length) and ``content`` (one entry per page), or a JSON
        ``{"error": ...}`` body with status 401 (bad API key), 400 (missing,
        unnamed or unsupported file) or 500 (processing failure).
    """
    import shutil  # local import: only needed for temp-dir cleanup

    # API Key validation
    if not validate_api_key():
        return jsonify({"error": "Invalid or missing API key"}), 401

    if 'file' not in request.files:
        return jsonify({"error": "No file provided"}), 400

    file = request.files['file']
    if not file or file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    if not allowed_file(file.filename):
        return jsonify({"error": "Invalid file type"}), 400

    # secure_filename can strip characters (e.g. ".pdf" becomes "pdf"), so the
    # sanitised name is validated again before its extension is taken.
    filename = secure_filename(file.filename)
    if not allowed_file(filename):
        return jsonify({"error": "Invalid file type"}), 400
    file_extension = filename.rsplit('.', 1)[1].lower()

    # Every non-PDF type yields a single-page result from its processor.
    single_page_processors = {
        'docx': process_docx,
        'txt': process_txt,
        'csv': process_csv,
        'xlsx': process_excel,
        'xls': process_excel,
        'jpg': process_image,
        'jpeg': process_image,
        'png': process_image,
    }

    temp_dir = None
    try:
        # Save uploaded file temporarily
        temp_dir = tempfile.mkdtemp()
        temp_path = os.path.join(temp_dir, filename)
        file.save(temp_path)

        start_time = time.time()

        # Process file based on extension
        if file_extension == 'pdf':
            results = _process_pdf(temp_path)

            # Length of the combined text: each page contributes its native
            # text, a newline, its OCR text and another newline.
            text_length = sum(
                len(page.get("native_text", "")) + len(page.get("image_text", "")) + 2
                for page in results
            )

            response = {
                "metadata": {
                    "filename": filename,
                    "pages": len(results),
                    "processing_time": round(time.time() - start_time, 2),
                    "text_length": text_length
                },
                "content": results
            }
        elif file_extension in single_page_processors:
            response = single_page_processors[file_extension](temp_path)
            response['metadata'] = {
                "filename": filename,
                "pages": 1,
                "processing_time": round(time.time() - start_time, 2),
                "text_length": len(response['content'][0]['text'])
            }
        else:
            return jsonify({"error": "Unsupported file type"}), 400

        return jsonify(response)

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Processing failed: %s", e)
        return jsonify({"error": str(e)}), 500

    finally:
        # Clean up temporary files; rmtree also copes with a non-empty directory.
        if temp_dir is not None:
            try:
                shutil.rmtree(temp_dir)
            except OSError as e:
                logger.error(f"Cleanup failed: {str(e)}")
344
+
345
if __name__ == '__main__':
    # The Werkzeug debugger lets anyone who can reach the port run arbitrary
    # Python, so it must not be on by default while listening on 0.0.0.0.
    # Set FLASK_DEBUG=1 explicitly for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)
curl_2.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ curl -X POST \
2
+ http://localhost:5000/process \
3
+ -H 'X-API-KEY: your_api_key_1' \
4
+ -F 'file=@/path/to/your/file.pdf'
install_dependencies.sh ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# =============================================
# INSTALL ALL DEPENDENCIES FOR DOCUMENT PROCESSING
# =============================================

# Stop at the first failing command (and on unset variables or failed pipes)
# so the success message below is only printed when every step succeeded.
set -euo pipefail

# 1. Install system dependencies (OCR, PDF, OpenCV)
sudo apt-get update && sudo apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-ben \
    tesseract-ocr-hin \
    tesseract-ocr-urd \
    poppler-utils \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libzbar0 \
    antiword \
    unrtf

# 2. Install Python packages
pip install --upgrade \
    flask \
    pdfplumber \
    pdf2image \
    pillow \
    pytesseract \
    opencv-python-headless \
    numpy \
    pandas \
    python-docx \
    openpyxl \
    waitress \
    flask-httpauth \
    flask-cors \
    easyocr \
    torch \
    pyzbar \
    textract \
    transformers \
    pdfminer.six

echo "✅ All dependencies installed successfully!"
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
Flask==2.2.5
# Flask 2.2.x imports werkzeug.urls.url_quote, which Werkzeug 3.0 removed;
# pin a compatible 2.x release so a fresh install does not fail at import.
Werkzeug==2.2.3
flask-httpauth==4.7.0
pdfplumber==0.10.3
pdf2image==1.16.3
Pillow==9.5.0
opencv-python-headless==4.8.0.76
numpy==1.24.4
pandas==1.5.3
python-docx==0.8.11
openpyxl==3.1.2
# pandas.read_excel relies on xlrd to read legacy .xls workbooks.
xlrd==2.0.1
easyocr==1.7.1
torch==2.0.1
torchvision==0.15.2
scikit-image==0.21.0
matplotlib==3.7.1
tqdm==4.65.0
PyMuPDF==1.22.0