alakxender committed
Commit 228e8c1 · 0 Parent(s)
Files changed (14):
  1. .gitattributes +36 -0
  2. .gitignore +2 -0
  3. README.md +13 -0
  4. app.py +423 -0
  5. detector.py +141 -0
  6. example.pdf +3 -0
  7. hw_1_sl.png +0 -0
  8. hw_2_sl.jpg +0 -0
  9. hw_3_sl.png +0 -0
  10. hw_4_sl.png +0 -0
  11. ml.png +0 -0
  12. requirements.txt +3 -0
  13. type_1_sl.png +0 -0
  14. type_2_sl.png +0 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ output
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Dhivehi Ocr
+ emoji: 📝
+ colorFrom: gray
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 5.25.2
+ app_file: app.py
+ pinned: false
+ short_description: Thaana text-to-image, ocr
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,423 @@
+ import spaces
+ import gradio as gr
+ import os
+ import sys
+ import subprocess
+ from PIL import Image, ImageDraw
+ from detector import TextDetector
+ import tempfile
+ import shutil
+ import json
+ from datetime import datetime
+
+ # List of available models with their IDs and prompts
+ MODELS = {
+     "Medium-14k, Single Line": {  # /lab/mx01/md/sl-14/ft/
+         "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-14k",
+         "prompt": "What text is written in this image?"
+     },
+     "Medium-16k, Single Line": {  # /lab/mx01/md/sl-16/ft/
+         "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-md-16k",
+         "prompt": "What text is written in this image?"
+     },
+     "Small, Single Line": {  # /lab/mx01/sm/sl/ft/
+         "id": "alakxender/paligemma2-qlora-vrd-dhivehi-ocr-224-sm",
+         "prompt": "What text is written in this image?"
+     }
+ }
+ # Earlier experimental entries, kept here for reference:
+ """
+ "Full Text": {  # /lab/mx01/pr/sl/ft/
+     "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-mx01",
+     "prompt": "What text is written in this image?",
+ },
+ "Full Text": {  # /lab/mx01/pr/sl/ft/
+     "id": "alakxender/paligemma2-qlora-dhivehi-ocr-448-mx01",
+     "prompt": "OCR",
+ },
+ "Final": {  # /lab/mx01/pr/sl/ft-final/
+     "id": "alakxender/paligemma2-dhivehi-ocr-448-mx01-final",
+     "prompt": "OCR",  # smaller the better: 3k vrd, 3k printed, 3k handwritten, 1k single line
+ }
+ """
+
+ # Global model state
+ model = None
+ processor = None
+ current_model_name = None
+ detector = TextDetector()
+
+ def load_model(model_name):
+     """Load the model and processor"""
+     global model, processor, current_model_name
+
+     model_id = MODELS[model_name]['id']
+
+     # Load the PEFT configuration to get the base model path
+     peft_config = PeftConfig.from_pretrained(model_id)
+
+     # Load the base model
+     base_model = PaliGemmaForConditionalGeneration.from_pretrained(
+         peft_config.base_model_name_or_path,
+         device_map="auto",
+         torch_dtype=torch.bfloat16
+     )
+
+     # Load the adapter on top of the base model
+     model = PeftModel.from_pretrained(base_model, model_id)
+     processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path)
+     current_model_name = model_name
+
+ def process_single_line(image, model_name):
+     """Process a single line of text"""
+     prompt = MODELS[model_name]["prompt"]
+     # Add the image token to the prompt
+     prompt = f"<image>{prompt}"
+     model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch.bfloat16).to("cuda")
+
+     outputs = model.generate(
+         **model_inputs,
+         max_new_tokens=500,
+         do_sample=False
+     )
+
+     generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+     # Remove the prompt and any leading/trailing whitespace
+     cleaned_text = generated_text.replace(prompt, "").strip()
+     # Remove any remaining question marks or other artifacts
+     cleaned_text = cleaned_text.lstrip("?").strip()
+     # Remove the prompt text if it somehow appears in the output
+     cleaned_text = cleaned_text.replace("What text is written in this image?", "").strip()
+     return cleaned_text
+
+ def draw_bboxes(image, text_lines):
+     """Draw bounding boxes on the image"""
+     draw = ImageDraw.Draw(image)
+     for line in text_lines:
+         # Draw polygon - flatten nested coordinates
+         polygon = line['polygon']
+         flat_polygon = [coord for point in polygon for coord in point]
+         draw.polygon(flat_polygon, outline="red", width=2)
+
+         # Draw bbox
+         x1, y1, x2, y2 = line['bbox']
+         draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
+
+         # Draw confidence score
+         draw.text((x1, y1 - 10), f"{line['confidence']:.2f}", fill="red")
+     return image
+
+ def process_multi_line(image, model_name, progress=gr.Progress()):
+     """Process a multi-line image by detecting text regions and OCRing each region"""
+     # Create temporary directory
+     with tempfile.TemporaryDirectory() as temp_dir:
+         # Save input image
+         input_path = os.path.join(temp_dir, "input.png")
+         image.save(input_path)
+
+         # Initialize detector with temp directory
+         detector = TextDetector(output_dir=temp_dir)
+
+         # Run text detection
+         progress(0.1, desc="Detecting text regions...")
+         results = detector.process_input(input_path, save_images=True)
+
+         # Get text regions for the image
+         regions = detector.get_text_regions(results, "input")
+         if not regions:
+             return "No text regions detected", []
+
+         # Process each text region
+         page_regions = regions[0]  # First page
+         text_lines = page_regions.get('bboxes', [])
+
+         if not text_lines:
+             return "No text lines detected", []
+
+         # Sort text lines by y-coordinate (top to bottom)
+         text_lines.sort(key=lambda x: x['bbox'][1])
+
+         # Draw bounding boxes on the image
+         bbox_image = image.copy()
+         bbox_image = draw_bboxes(bbox_image, text_lines)
+
+         # Process each text line
+         all_text = []
+         total_lines = len(text_lines)
+
+         for i, line in enumerate(text_lines):
+             progress(0.2 + (i / total_lines) * 0.8, desc=f"Processing line {i+1}/{total_lines}...")
+
+             # Extract text region using bbox
+             x1, y1, x2, y2 = line['bbox']
+             line_image = image.crop((x1, y1, x2, y2))
+
+             # Process the line
+             line_text = process_single_line(line_image, model_name)
+             all_text.append(line_text)
+
+         progress(1.0, desc="Done!")
+         return "\n".join(all_text), [bbox_image]  # Return as list for gallery
+
+ @spaces.GPU  # needed for the per-line CUDA calls, same as process_image
+ def process_pdf(pdf_path, model_name, progress=gr.Progress()):
+     """Process a PDF file"""
+     # Load model if a different model was selected (mirrors process_image)
+     if model_name != current_model_name:
+         progress(0, desc="Loading model...")
+         load_model(model_name)
+
+     # Create temporary directory
+     with tempfile.TemporaryDirectory() as temp_dir:
+         # Initialize detector with temp directory
+         detector = TextDetector(output_dir=temp_dir)
+
+         # Run text detection on PDF (process first 2 pages)
+         progress(0.1, desc="Detecting text regions in PDF...")
+         results = detector.process_input(pdf_path, save_images=True, page_range="0,1")
+
+         # Get text regions for the PDF
+         regions = detector.get_text_regions(results, os.path.splitext(os.path.basename(pdf_path))[0])
+         if not regions:
+             return "No text regions detected", []
+
+         # Process each page
+         all_text = []
+         bbox_images = []
+
+         # Get the base name of the PDF without extension
+         pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
+         for page_num, page_regions in enumerate(regions):
+             progress(0.2 + (page_num / 2) * 0.3, desc=f"Processing page {page_num+1}...")
+
+             # Try different possible paths for the page image
+             possible_paths = [
+                 os.path.join(temp_dir, pdf_name, f"{pdf_name}_{page_num}_bbox.png"),  # Detector's actual path
+                 os.path.join(temp_dir, pdf_name, f"page_{page_num}.png"),  # Original path
+                 os.path.join(temp_dir, f"page_{page_num}.png"),  # Direct in output dir
+                 os.path.join(temp_dir, f"{pdf_name}_page_{page_num}.png")  # Alternative naming
+             ]
+
+             page_image = None
+             for page_image_path in possible_paths:
+                 if os.path.exists(page_image_path):
+                     page_image = Image.open(page_image_path)
+                     break
+
+             if page_image is None:
+                 all_text.append(f"\nPage {page_num+1}: Page image not found. Tried paths:\n" +
+                                 "\n".join(f"- {path}" for path in possible_paths))
+                 continue
+
+             text_lines = page_regions.get('bboxes', [])
+             if not text_lines:
+                 all_text.append(f"\nPage {page_num+1}: No text lines detected")
+                 continue
+
+             # Sort text lines by y-coordinate (top to bottom)
+             text_lines.sort(key=lambda x: x['bbox'][1])
+
+             # Draw bounding boxes on the image
+             bbox_image = page_image.copy()
+             bbox_image = draw_bboxes(bbox_image, text_lines)
+             bbox_images.append(bbox_image)
+
+             # Process each text line
+             page_text = []
+             total_lines = len(text_lines)
+
+             for i, line in enumerate(text_lines):
+                 progress(0.5 + (page_num / 2) * 0.2 + (i / total_lines) * 0.3,
+                          desc=f"Processing line {i+1}/{total_lines} on page {page_num+1}...")
+
+                 # Extract text region using bbox
+                 x1, y1, x2, y2 = line['bbox']
+                 line_image = page_image.crop((x1, y1, x2, y2))
+
+                 # Process the line
+                 line_text = process_single_line(line_image, model_name)
+                 page_text.append(line_text)
+
+             # Add page text without page numbers
+             all_text.extend(page_text)
+
+         progress(1.0, desc="Done!")
+         return "\n".join(all_text), bbox_images  # Return list of bbox images
+
+ @spaces.GPU
+ def process_image(model_name, image, progress=gr.Progress()):
+     """Process a single image"""
+     if image is None:
+         return "", None
+
+     # Load model if a different model was selected
+     if model_name != current_model_name:
+         progress(0, desc="Loading model...")
+         load_model(model_name)
+
+     return process_multi_line(image, model_name, progress)
+
+ # Example images with descriptions
+ examples = [
+     ["type_1_sl.png", "Typed Dhivehi text sample 1"],
+     ["type_2_sl.png", "Typed Dhivehi text sample 2"],
+     ["hw_1_sl.png", "Handwritten Dhivehi text sample 1"],  # exp this
+     ["hw_2_sl.jpg", "Handwritten Dhivehi text sample 2"],  # exp val3
+     ["hw_3_sl.png", "Handwritten Dhivehi text sample 3"],  # exp val2
+     ["hw_4_sl.png", "Handwritten Dhivehi text sample 4"],  # exp val1
+     ["ml.png", "Multi-line Dhivehi text sample"]
+ ]
+
+ css = """
+ .textbox1 textarea {
+     font-size: 18px !important;
+     font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
+     line-height: 1.8 !important;
+ }
+ .textbox2 textarea {
+     display: none;
+ }
+ """
+
+ with gr.Blocks(title="Dhivehi OCR", css=css) as demo:
+     gr.Markdown("# Dhivehi OCR")
+     gr.Markdown("Thaana OCR experimental finetunes")
+
+     with gr.Row():
+         model_dropdown = gr.Dropdown(
+             choices=list(MODELS.keys()),
+             value=list(MODELS.keys())[0],  # Default to first model
+             label="Select Model"
+         )
+
+     with gr.Tabs():
+         with gr.Tab("Image Input"):
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     image_input = gr.Image(type="pil", label="Input Image")
+                     image_submit_btn = gr.Button("Extract Text")
+
+                     # Image examples
+                     gr.Examples(
+                         examples=[[img] for img, _ in examples],
+                         inputs=[image_input],
+                         label="Example Images",
+                         examples_per_page=8
+                     )
+
+                 with gr.Column(scale=3):
+                     with gr.Tabs():
+                         with gr.Tab("Extracted Text"):
+                             image_text_output = gr.Textbox(
+                                 lines=5,
+                                 label="Extracted Text",
+                                 show_copy_button=True,
+                                 rtl=True,
+                                 elem_classes="textbox1"
+                             )
+
+                         with gr.Tab("Detected Text Regions"):
+                             image_bbox_output = gr.Gallery(
+                                 label="Detected Text Regions",
+                                 show_label=True,
+                                 columns=2
+                             )
+
+         with gr.Tab("PDF Input"):
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     pdf_input = gr.File(
+                         label="Input PDF",
+                         file_types=[".pdf"]
+                     )
+                     pdf_submit_btn = gr.Button("Extract Text")
+
+                     # PDF examples (one value per input; add more here if needed)
+                     gr.Examples(
+                         examples=[
+                             ["example.pdf"],
+                         ],
+                         inputs=[pdf_input],
+                         label="Example PDFs",
+                         examples_per_page=8
+                     )
+
+                 with gr.Column(scale=3):
+                     with gr.Tabs():
+                         with gr.Tab("Extracted Text"):
+                             pdf_text_output = gr.Textbox(
+                                 lines=5,
+                                 label="Extracted Text",
+                                 show_copy_button=True,
+                                 rtl=True,
+                                 elem_classes="textbox1"
+                             )
+
+                         with gr.Tab("Detected Text Regions"):
+                             pdf_bbox_output = gr.Gallery(
+                                 label="Detected Text Regions",
+                                 show_label=True,
+                                 columns=2
+                             )
+
+     # Process image when button is clicked
+     image_submit_btn.click(
+         fn=process_image,
+         inputs=[model_dropdown, image_input],
+         outputs=[image_text_output, image_bbox_output]
+     )
+
+     # Process PDF when button is clicked
+     pdf_submit_btn.click(
+         fn=process_pdf,
+         inputs=[pdf_input, model_dropdown],
+         outputs=[pdf_text_output, pdf_bbox_output]
+     )
+
+     # Add experimental note at the bottom
+     gr.Markdown("""
+     ---
+     **Note:** This is an experimental proof of concept (POC) for Dhivehi OCR.
+     """)
+
+ # Function to install requirements
+ def install_requirements():
+     requirements_path = 'requirements.txt'
+
+     # Check if requirements.txt exists
+     if not os.path.exists(requirements_path):
+         print("Error: requirements.txt not found")
+         return False
+
+     try:
+         print("Installing requirements...")
+         # Using --no-cache-dir to avoid memory issues
+         subprocess.check_call([
+             sys.executable,
+             "-m",
+             "pip",
+             "install",
+             "-r",
+             requirements_path,
+             "--no-cache-dir"
+         ])
+         print("Successfully installed all requirements")
+         return True
+     except subprocess.CalledProcessError as e:
+         print(f"Error installing requirements: {e}")
+         return False
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+         return False
+
+ # Launch the app
+ if __name__ == "__main__":
+     # First install requirements
+     success = install_requirements()
+     if success:
+         print("All requirements installed successfully")
+
+         # Heavy imports are deferred until after the requirements install;
+         # they land in module globals, so load_model and the handlers can use them.
+         from transformers.image_utils import load_image
+         import torch
+         from transformers import PaliGemmaForConditionalGeneration, AutoProcessor
+         from peft import PeftModel, PeftConfig
+
+         # Load the first model by default
+         load_model(list(MODELS.keys())[0])
+
+         demo.launch(server_name="0.0.0.0", server_port=7812)
+         # demo.launch()
+     else:
+         print("Failed to install some requirements")
detector.py ADDED
@@ -0,0 +1,141 @@
+ import os
+ import json
+ import subprocess
+ from typing import Union, List, Dict, Optional
+ from pathlib import Path
+
+ class TextDetector:
+     def __init__(self, output_dir: Optional[str] = None):
+         """
+         Initialize the text detector.
+
+         Args:
+             output_dir: Optional directory to save results. If None, uses default surya_detect output directory.
+         """
+         self.output_dir = output_dir
+
+     def process_input(self,
+                       data_path: Union[str, Path],
+                       save_images: bool = False,
+                       page_range: Optional[str] = None) -> Dict:
+         """
+         Process input file or directory using surya_detect.
+
+         Args:
+             data_path: Path to image, PDF, or directory of images/PDFs
+             save_images: Whether to save images of pages and detected text lines
+             page_range: Optional page range to process in PDFs (e.g., "0,5-10,20")
+
+         Returns:
+             Dictionary containing detection results
+         """
+         # Convert to Path object if string
+         data_path = Path(data_path)
+
+         # Build surya_detect command
+         cmd = ["surya_detect", str(data_path)]
+
+         if save_images:
+             cmd.append("--images")
+
+         if self.output_dir:
+             cmd.extend(["--output_dir", self.output_dir])
+
+         if page_range:
+             cmd.extend(["--page_range", page_range])
+
+         # Run surya_detect
+         try:
+             subprocess.run(cmd, check=True)
+         except subprocess.CalledProcessError as e:
+             raise RuntimeError(f"Error running surya_detect: {e}")
+
+         # Read and return results
+         return self._read_results(data_path)
+
+     def _read_results(self, data_path: Path) -> Dict:
+         """
+         Read and parse the results.json file generated by surya_detect.
+
+         Args:
+             data_path: Path to the input file/directory
+
+         Returns:
+             Dictionary containing detection results
+         """
+         # Determine results file path
+         if self.output_dir:
+             # surya_detect creates a subdirectory with the input filename
+             input_name = data_path.stem
+             results_path = Path(self.output_dir) / input_name / "results.json"
+         else:
+             # Default surya_detect output location
+             results_path = data_path.parent / "results.json"
+
+         if not results_path.exists():
+             raise FileNotFoundError(f"Results file not found at {results_path}")
+
+         # Read and parse results
+         with open(results_path, 'r') as f:
+             results = json.load(f)
+
+         return results
+
+     def get_text_regions(self, results: Dict, filename: str) -> List[Dict]:
+         """
+         Extract text regions from detection results for a specific file.
+
+         Args:
+             results: Detection results dictionary
+             filename: Name of the file to get regions for (without extension)
+
+         Returns:
+             List of dictionaries containing text regions for each page
+         """
+         if filename not in results:
+             raise KeyError(f"No results found for file {filename}")
+
+         return results[filename]
+
+     def get_page_regions(self, results: Dict, filename: str, page_num: int) -> Dict:
+         """
+         Get text regions for a specific page of a file.
+
+         Args:
+             results: Detection results dictionary
+             filename: Name of the file (without extension)
+             page_num: Page number (0-based)
+
+         Returns:
+             Dictionary containing text regions for the specified page
+         """
+         regions = self.get_text_regions(results, filename)
+
+         if page_num >= len(regions):
+             raise IndexError(f"Page {page_num} not found in results")
+
+         return regions[page_num]
+
+     def get_text_lines(self, page_regions: Dict) -> List[Dict]:
+         """
+         Extract text lines from page regions.
+
+         Args:
+             page_regions: Dictionary containing page detection results
+
+         Returns:
+             List of dictionaries containing text line information
+         """
+         return page_regions.get('bboxes', [])
+
+     def get_vertical_lines(self, page_regions: Dict) -> List[Dict]:
+         """
+         Extract vertical lines from page regions.
+
+         Args:
+             page_regions: Dictionary containing page detection results
+
+         Returns:
+             List of dictionaries containing vertical line information
+         """
+         return page_regions.get('vertical_lines', [])
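
A minimal usage sketch for the TextDetector wrapper above, assuming the `surya_detect` CLI (installed via the pinned `surya-ocr` package) is on PATH; `sample.png` and the `output` directory are illustrative names, not part of the commit:

```python
# Usage sketch for TextDetector; "sample.png" and "output" are illustrative.
from detector import TextDetector

detector = TextDetector(output_dir="output")
results = detector.process_input("sample.png", save_images=True)

# Results are keyed by the input file's stem (name without extension).
first_page = detector.get_page_regions(results, "sample", 0)

# Each detected line carries 'bbox', 'polygon', and 'confidence' fields,
# which is how app.py crops and annotates the regions.
for line in detector.get_text_lines(first_page):
    x1, y1, x2, y2 = line["bbox"]
    print(f"({x1}, {y1})-({x2}, {y2}) confidence={line['confidence']:.2f}")
```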
example.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:525262d9de0efaf83a0c8559d58d6f11f13dec1c319ff10a70a463047fa5ff80
+ size 100352
hw_1_sl.png ADDED
hw_2_sl.jpg ADDED
hw_3_sl.png ADDED
hw_4_sl.png ADDED
ml.png ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ peft
+ surya-ocr==0.13.1
type_1_sl.png ADDED
type_2_sl.png ADDED