alakxender committed
Commit 97bb8f1 · Parent(s): edd2900
Files changed (3):
  1. app.py +196 -355
  2. gemma.py +304 -0
  3. paligemma2.py +315 -0
app.py CHANGED
@@ -3,288 +3,42 @@ import gradio as gr
 import os
 import sys
 import subprocess
-from PIL import Image, ImageDraw
-from detector import TextDetector
-import tempfile
-import shutil
-import json
-from datetime import datetime
 import numpy as np
+from paligemma2 import PaliGemma2Handler, MODELS as PALIGEMMA_MODELS
+from gemma import GemmaHandler, MODELS as GEMMA_MODELS
 
-# List of available models with their IDs and prompts
-MODELS = {
-    "Medium-14k, Single Line": { # /lab/mx01/md/sl-14/ft/
-        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-14k",
-        "prompt": "What text is written in this image?"
-    },
-    "Medium-16k, Single Line": { # /lab/mx01/md/sl-16/ft/
-        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-md-16k",
-        "prompt": "What text is written in this image?"
-    },
-    "Small, Single Line": { # /lab/mx01/sm/sl/ft/
-        "id": "alakxender/paligemma2-qlora-vrd-dhivehi-ocr-224-sm",
-        "prompt": "What text is written in this image?"
-    }
-}
-""" "Full Text": { # /lab/mx01/pr/sl/ft/
-    "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-mx01",
-    "prompt": "What text is written in this image?",
-},
-Full Text": { # /lab/mx01/pr/sl/ft/
-    "id": "alakxender/paligemma2-qlora-dhivehi-ocr-448-mx01",
-    "prompt": "OCR",
-},
-Final": { # /lab/mx01/pr/sl/ft-final/
-    "id": "alakxender/paligemma2-dhivehi-ocr-448-mx01-final",
-    "prompt": "OCR", # smaller the better: 3k vrd, 3k printed, 3k handwritten, 1k single line
-}"""
-
-# Global model state
-model = None
-processor = None
-current_model_name = None
-detector = TextDetector()
-
-def load_model(model_name):
-    """Load the model and processor"""
-    global model, processor, current_model_name
-
-    model_id = MODELS[model_name]['id']
-
-    # Load the PEFT configuration to get the base model path
-    peft_config = PeftConfig.from_pretrained(model_id)
-
-    # Load the base model
-    base_model = PaliGemmaForConditionalGeneration.from_pretrained(
-        peft_config.base_model_name_or_path,
-        device_map="auto",
-        torch_dtype=torch.bfloat16
-    )
-
-    # Load the adapter on top of the base model
-    model = PeftModel.from_pretrained(base_model, model_id)
-    processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path)
-    current_model_name = model_name
-
-def process_single_line(image, model_name):
-    """Process a single line of text"""
-    prompt = MODELS[model_name]["prompt"]
-    # Add image token to prompt
-    prompt = f"<image>{prompt}"
-
-    # First prepare inputs without moving to CUDA
-    model_inputs = processor(text=prompt, images=image, return_tensors="pt")
-
-    # Then move to CUDA and convert only image tensors to bfloat16
-    for k, v in model_inputs.items():
-        if k == "pixel_values":
-            model_inputs[k] = v.to(torch.bfloat16).to("cuda")
-        else:
-            model_inputs[k] = v.to("cuda")
-
-    outputs = model.generate(
-        **model_inputs,
-        max_new_tokens=500,
-        do_sample=False
-    )
-
-    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    # Remove the prompt and any leading/trailing whitespace
-    cleaned_text = generated_text.replace(prompt, "").strip()
-    # Remove any remaining question marks or other artifacts
-    cleaned_text = cleaned_text.lstrip("?").strip()
-    # Remove the prompt text if it somehow appears in the output
-    cleaned_text = cleaned_text.replace("What text is written in this image?", "").strip()
-    return cleaned_text
-
-def draw_bboxes(image, text_lines):
-    """Draw bounding boxes on the image"""
-    draw = ImageDraw.Draw(image)
-    for line in text_lines:
-        # Draw polygon - flatten nested coordinates
-        polygon = line['polygon']
-        flat_polygon = [coord for point in polygon for coord in point]
-        draw.polygon(flat_polygon, outline="red", width=2)
-
-        # Draw bbox
-        x1, y1, x2, y2 = line['bbox']
-        draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
-
-        # Draw confidence score
-        draw.text((x1, y1 - 10), f"{line['confidence']:.2f}", fill="red")
-    return image
-
-def process_multi_line(image, model_name, progress=gr.Progress()):
-    """Process a multi-line image by detecting text regions and OCRing each region"""
-    # Create temporary directory
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Save input image
-        input_path = os.path.join(temp_dir, "input.png")
-        image.save(input_path)
-
-        # Initialize detector with temp directory
-        detector = TextDetector(output_dir=temp_dir)
-
-        # Run text detection
-        progress(0.1, desc="Detecting text regions...")
-        results = detector.process_input(input_path, save_images=True)
-
-        # Get text regions for the image
-        regions = detector.get_text_regions(results, "input")
-        if not regions:
-            return "No text regions detected", []
-
-        # Process each text region
-        page_regions = regions[0] # First page
-        text_lines = page_regions.get('bboxes', [])
-
-        if not text_lines:
-            return "No text lines detected", []
-
-        # Sort text lines by y-coordinate (top to bottom)
-        text_lines.sort(key=lambda x: x['bbox'][1])
-
-        # Draw bounding boxes on the image
-        bbox_image = image.copy()
-        bbox_image = draw_bboxes(bbox_image, text_lines)
-
-        # Process each text line
-        all_text = []
-        total_lines = len(text_lines)
-
-        for i, line in enumerate(text_lines):
-            progress(0.2 + (i/total_lines)*0.8, desc=f"Processing line {i+1}/{total_lines}...")
-
-            # Extract text region using bbox
-            x1, y1, x2, y2 = line['bbox']
-            line_image = image.crop((x1, y1, x2, y2))
-
-            # Process the line
-            line_text = process_single_line(line_image, model_name)
-            all_text.append(line_text)
-
-        progress(1.0, desc="Done!")
-        return "\n".join(all_text), [bbox_image] # Return as list for gallery
+# Initialize model handlers
+paligemma_handler = PaliGemma2Handler()
+gemma_handler = GemmaHandler()
 
 @spaces.GPU
-def process_image(model_name, image, progress=gr.Progress()):
-    """Process a single image"""
-    if image is None:
-        return "", []
-
-    # Load model if different model selected
-    if model_name != current_model_name:
-        progress(0, desc="Loading model...")
-        load_model(model_name)
-
-    if isinstance(image, np.ndarray):
-        image = Image.fromarray(image)
-
-    width, height = image.size
-    print(f"Image dimensions: {width}x{height}")
-
-    if height > 50:
-        return process_multi_line(image, model_name, progress)
-    else:
-        return process_single_line(image, model_name), [image]
+def process_image_paligemma(model_name, image, progress=gr.Progress()):
+    """Process a single image with PaliGemma2"""
+    return paligemma_handler.process_image(model_name, image, progress)
+
+@spaces.GPU
+def process_image_gemma(model_name, image, progress=gr.Progress()):
+    """Process a single image with Gemma"""
+    return gemma_handler.process_image(model_name, image, progress)
+
+@spaces.GPU
+def process_pdf_paligemma(pdf_path, model_name, progress=gr.Progress()):
+    """Process a PDF file with PaliGemma2"""
+    return paligemma_handler.process_pdf(pdf_path, model_name, progress)
 
 @spaces.GPU
-def process_pdf(pdf_path, model_name, progress=gr.Progress()):
-    """Process a PDF file"""
-    if pdf_path is None:
-        return "", []
-
-    # Load model if different model selected
-    if model_name != current_model_name:
-        progress(0, desc="Loading model...")
-        load_model(model_name)
-
-    # Create temporary directory
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Initialize detector with temp directory
-        detector = TextDetector(output_dir=temp_dir)
-
-        # Run text detection on PDF (process first 2 pages)
-        progress(0.1, desc="Detecting text regions in PDF...")
-        results = detector.process_input(pdf_path, save_images=True, page_range="0")
-
-        # Get text regions for the PDF
-        regions = detector.get_text_regions(results, os.path.splitext(os.path.basename(pdf_path))[0])
-        if not regions:
-            return "No text regions detected", []
-
-        # Process each page
-        all_text = []
-        bbox_images = []
-
-        # Get the base name of the PDF without extension
-        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
-
-        for page_num, page_regions in enumerate(regions):
-            progress(0.2 + (page_num/2)*0.3, desc=f"Processing page {page_num+1}...")
-
-            # Try different possible paths for the page image
-            possible_paths = [
-                os.path.join(temp_dir, pdf_name, f"{pdf_name}_{page_num}_bbox.png"), # Detector's actual path
-                os.path.join(temp_dir, pdf_name, f"page_{page_num}.png"), # Original path
-                os.path.join(temp_dir, f"page_{page_num}.png"), # Direct in output dir
-                os.path.join(temp_dir, f"{pdf_name}_page_{page_num}.png") # Alternative naming
-            ]
-
-            page_image = None
-            for page_image_path in possible_paths:
-                if os.path.exists(page_image_path):
-                    page_image = Image.open(page_image_path)
-                    break
-
-            if page_image is None:
-                all_text.append(f"\nPage {page_num+1}: Page image not found. Tried paths:\n" +
-                                "\n".join(f"- {path}" for path in possible_paths))
-                continue
-
-            text_lines = page_regions.get('bboxes', [])
-            if not text_lines:
-                all_text.append(f"\nPage {page_num+1}: No text lines detected")
-                continue
-
-            # Sort text lines by y-coordinate (top to bottom)
-            text_lines.sort(key=lambda x: x['bbox'][1])
-
-            # Draw bounding boxes on the image
-            bbox_image = page_image.copy()
-            bbox_image = draw_bboxes(bbox_image, text_lines)
-            bbox_images.append(bbox_image)
-
-            # Process each text line
-            page_text = []
-            total_lines = len(text_lines)
-
-            for i, line in enumerate(text_lines):
-                progress(0.5 + (page_num/2)*0.2 + (i/total_lines)*0.3,
-                         desc=f"Processing line {i+1}/{total_lines} on page {page_num+1}...")
-
-                # Extract text region using bbox
-                x1, y1, x2, y2 = line['bbox']
-                line_image = page_image.crop((x1, y1, x2, y2))
-
-                # Process the line
-                line_text = process_single_line(line_image, model_name)
-                page_text.append(line_text)
-
-            # Add page text without page number
-            all_text.extend(page_text)
-
-        progress(1.0, desc="Done!")
-        return "\n".join(all_text), bbox_images # Return list of bbox images
+def process_pdf_gemma(pdf_path, model_name, progress=gr.Progress()):
+    """Process a PDF file with Gemma"""
+    return gemma_handler.process_pdf(pdf_path, model_name, progress)
 
 # Example images with descriptions
 examples = [
     ["type_1_sl.png", "Typed Dhivehi text sample 1"],
     ["type_2_sl.png", "Typed Dhivehi text sample 2"],
-    ["hw_1_sl.png", "Handwritten Dhivehi text sample 1"], # exp this
-    ["hw_2_sl.jpg", "Handwritten Dhivehi text sample 2"], # exp val3
-    ["hw_3_sl.png", "Handwritten Dhivehi text sample 3"], # exp val2
-    ["hw_4_sl.png", "Handwritten Dhivehi text sample 4"], # exp val1
+    ["hw_1_sl.png", "Handwritten Dhivehi text sample 1"],
+    ["hw_2_sl.jpg", "Handwritten Dhivehi text sample 2"],
+    ["hw_3_sl.png", "Handwritten Dhivehi text sample 3"],
+    ["hw_4_sl.png", "Handwritten Dhivehi text sample 4"],
     ["ml.png", "Multi-line Dhivehi text sample"]
 ]
 
@@ -299,101 +53,190 @@ css = """
 }
 """
 
-with gr.Blocks(title="Dhivehi OCR",css=css) as demo:
-    gr.Markdown("# Dhivehi OCR")
-    gr.Markdown("Thaana OCR experimental finetunes")
-
-    with gr.Row():
-        model_dropdown = gr.Dropdown(
-            choices=list(MODELS.keys()),
-            value=list(MODELS.keys())[0], # Default to first model
-            label="Select Model"
-        )
+with gr.Blocks(title="Dhivehi Image to Text",css=css) as demo:
+    gr.Markdown("# Dhivehi Image to Text")
+    gr.Markdown("Dhivehi Image to Text experimental finetunes")
 
     with gr.Tabs():
-        with gr.Tab("Image Input"):
-            with gr.Row():
-                with gr.Column(scale=2):
-                    image_input = gr.Image(type="pil", label="Input Image")
-                    image_submit_btn = gr.Button("Extract Text")
-
-                    # Image examples
-                    gr.Examples(
-                        examples=[[img] for img, _ in examples],
-                        inputs=[image_input],
-                        label="Example Images",
-                        examples_per_page=8
-                    )
-
-                with gr.Column(scale=3):
-                    with gr.Tabs():
-                        with gr.Tab("Extracted Text"):
-                            image_text_output = gr.Textbox(
-                                lines=5,
-                                label="Extracted Text",
-                                show_copy_button=True,
-                                rtl=True,
-                                elem_classes="textbox1"
-                            )
-
-                        with gr.Tab("Detected Text Regions"):
-                            image_bbox_output = gr.Gallery(
-                                label="Detected Text Regions",
-                                show_label=True,
-                                columns=2
-                            )
-
-        with gr.Tab("PDF Input"):
-            with gr.Row():
-                with gr.Column(scale=2):
-                    pdf_input = gr.File(
-                        label="Input PDF",
-                        file_types=[".pdf"]
-                    )
-                    pdf_submit_btn = gr.Button("Extract Text")
-
-                    # PDF examples
-                    gr.Examples(
-                        examples=[
-                            ["example.pdf", "Example 1"],
-                        ], # Add PDF examples here if needed
-                        inputs=[pdf_input],
-                        label="Example PDFs",
-                        examples_per_page=8
-                    )
-
-                with gr.Column(scale=3):
-                    with gr.Tabs():
-                        with gr.Tab("Extracted Text"):
-                            pdf_text_output = gr.Textbox(
-                                lines=5,
-                                label="Extracted Text",
-                                show_copy_button=True,
-                                rtl=True,
-                                elem_classes="textbox1"
-                            )
-
-                        with gr.Tab("Detected Text Regions"):
-                            pdf_bbox_output = gr.Gallery(
-                                label="Detected Text Regions",
-                                show_label=True,
-                                columns=2
-                            )
-
-    # Process image when button is clicked
-    image_submit_btn.click(
-        fn=process_image,
-        inputs=[model_dropdown, image_input],
-        outputs=[image_text_output, image_bbox_output]
-    )
-
-    # Process PDF when button is clicked
-    pdf_submit_btn.click(
-        fn=process_pdf,
-        inputs=[pdf_input, model_dropdown],
-        outputs=[pdf_text_output, pdf_bbox_output]
-    )
+        with gr.Tab("PaliGemma2"):
+            model_dropdown_paligemma = gr.Dropdown(
+                choices=list(PALIGEMMA_MODELS.keys()),
+                value=list(PALIGEMMA_MODELS.keys())[0],
+                label="Select PaliGemma2 Model"
+            )
+
+            with gr.Tabs():
+                with gr.Tab("Image Input"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            image_input_paligemma = gr.Image(type="pil", label="Input Image")
+                            image_submit_btn_paligemma = gr.Button("Extract Text")
+
+                            # Image examples
+                            gr.Examples(
+                                examples=[[img] for img, _ in examples],
+                                inputs=[image_input_paligemma],
+                                label="Example Images",
+                                examples_per_page=8
+                            )
+
+                        with gr.Column(scale=3):
+                            with gr.Tabs():
+                                with gr.Tab("Extracted Text"):
+                                    image_text_output_paligemma = gr.Textbox(
+                                        lines=5,
+                                        label="Extracted Text",
+                                        show_copy_button=True,
+                                        rtl=True,
+                                        elem_classes="textbox1"
+                                    )
+
+                                with gr.Tab("Detected Text Regions"):
+                                    image_bbox_output_paligemma = gr.Gallery(
+                                        label="Detected Text Regions",
+                                        show_label=True,
+                                        columns=2
+                                    )
+
+                with gr.Tab("PDF Input"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            pdf_input_paligemma = gr.File(
+                                label="Input PDF",
+                                file_types=[".pdf"]
+                            )
+                            pdf_submit_btn_paligemma = gr.Button("Extract Text from PDF")
+
+                            # PDF examples
+                            gr.Examples(
+                                examples=[
+                                    ["example.pdf", "Example 1"],
+                                ],
+                                inputs=[pdf_input_paligemma],
+                                label="Example PDFs",
+                                examples_per_page=8
+                            )
+
+                        with gr.Column(scale=3):
+                            with gr.Tabs():
+                                with gr.Tab("Extracted Text"):
+                                    pdf_text_output_paligemma = gr.Textbox(
+                                        lines=5,
+                                        label="Extracted Text",
+                                        show_copy_button=True,
+                                        rtl=True,
+                                        elem_classes="textbox1"
+                                    )
+
+                                with gr.Tab("Detected Text Regions"):
+                                    pdf_bbox_output_paligemma = gr.Gallery(
+                                        label="Detected Text Regions",
+                                        show_label=True,
+                                        columns=2
+                                    )
+
+        with gr.Tab("Gemma"):
+            model_dropdown_gemma = gr.Dropdown(
+                choices=list(GEMMA_MODELS.keys()),
+                value=list(GEMMA_MODELS.keys())[0],
+                label="Select Gemma Model"
+            )
+
+            with gr.Tabs():
+                with gr.Tab("Image Input"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            image_input_gemma = gr.Image(type="pil", label="Input Image")
+                            image_submit_btn_gemma = gr.Button("Extract Text")
+
+                            # Image examples
+                            gr.Examples(
+                                examples=[[img] for img, _ in examples],
+                                inputs=[image_input_gemma],
+                                label="Example Images",
+                                examples_per_page=8
+                            )
+
+                        with gr.Column(scale=3):
+                            with gr.Tabs():
+                                with gr.Tab("Extracted Text"):
+                                    image_text_output_gemma = gr.Textbox(
+                                        lines=5,
+                                        label="Extracted Text",
+                                        show_copy_button=True,
+                                        rtl=True,
+                                        elem_classes="textbox1"
+                                    )
+
+                                with gr.Tab("Detected Text Regions"):
+                                    image_bbox_output_gemma = gr.Gallery(
+                                        label="Detected Text Regions",
+                                        show_label=True,
+                                        columns=2
+                                    )
+
+                with gr.Tab("PDF Input"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            pdf_input_gemma = gr.File(
+                                label="Input PDF",
+                                file_types=[".pdf"]
+                            )
+                            pdf_submit_btn_gemma = gr.Button("Extract Text from PDF")
+
+                            # PDF examples
+                            gr.Examples(
+                                examples=[
+                                    ["example.pdf", "Example 1"],
+                                ],
+                                inputs=[pdf_input_gemma],
+                                label="Example PDFs",
+                                examples_per_page=8
+                            )
+
+                        with gr.Column(scale=3):
+                            with gr.Tabs():
+                                with gr.Tab("Extracted Text"):
+                                    pdf_text_output_gemma = gr.Textbox(
+                                        lines=5,
+                                        label="Extracted Text",
+                                        show_copy_button=True,
+                                        rtl=True,
+                                        elem_classes="textbox1"
+                                    )
+
+                                with gr.Tab("Detected Text Regions"):
+                                    pdf_bbox_output_gemma = gr.Gallery(
+                                        label="Detected Text Regions",
+                                        show_label=True,
+                                        columns=2
+                                    )
+
+    # PaliGemma2 event handlers
+    image_submit_btn_paligemma.click(
+        fn=process_image_paligemma,
+        inputs=[model_dropdown_paligemma, image_input_paligemma],
+        outputs=[image_text_output_paligemma, image_bbox_output_paligemma]
+    )
+
+    pdf_submit_btn_paligemma.click(
+        fn=process_pdf_paligemma,
+        inputs=[pdf_input_paligemma, model_dropdown_paligemma],
+        outputs=[pdf_text_output_paligemma, pdf_bbox_output_paligemma]
+    )
+
+    # Gemma event handlers
+    image_submit_btn_gemma.click(
+        fn=process_image_gemma,
+        inputs=[model_dropdown_gemma, image_input_gemma],
+        outputs=[image_text_output_gemma, image_bbox_output_gemma]
+    )
+
+    pdf_submit_btn_gemma.click(
+        fn=process_pdf_gemma,
+        inputs=[pdf_input_gemma, model_dropdown_gemma],
+        outputs=[pdf_text_output_gemma, pdf_bbox_output_gemma]
+    )
 
 # Function to install requirements
 def install_requirements():
@@ -427,18 +270,16 @@ def install_requirements():
 
 # Launch the app
 if __name__ == "__main__":
-    # First install requirements
+    # First install requirements
     success = install_requirements()
     if success:
        print("All requirements installed successfully")
 
-    from transformers.image_utils import load_image
-    import torch
     from transformers import PaliGemmaForConditionalGeneration, AutoProcessor
    from peft import PeftModel, PeftConfig
 
-    # Load the first model by default
-    load_model(list(MODELS.keys())[0])
+    # Load the first PaliGemma2 model by default
+    #paligemma_handler.load_model(list(PALIGEMMA_MODELS.keys())[0])
 
     #demo.launch(server_name="0.0.0.0", server_port=7812)
     demo.launch()
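
After this refactor, app.py is pure UI wiring: each @spaces.GPU wrapper just delegates to a module-level handler singleton, so a model is only (re)loaded when the dropdown selection actually changes. A minimal headless sketch of the same flow, assuming a CUDA machine with enough memory for the bf16 base model plus adapter; "sample.png" is a placeholder path, not an asset from this repo:

    # Hypothetical smoke test of the handler API (not part of the commit).
    from PIL import Image
    from paligemma2 import PaliGemma2Handler, MODELS

    handler = PaliGemma2Handler()
    model_name = list(MODELS.keys())[0]   # "Medium-14k, Single Line"
    image = Image.open("sample.png")      # placeholder input image

    # The first call loads the base model and QLoRA adapter lazily,
    # because current_model_name starts out as None.
    text, gallery = handler.process_image(model_name, image, progress=None)
    print(text)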
gemma.py ADDED
@@ -0,0 +1,304 @@
+import torch
+from PIL import Image, ImageDraw
+from transformers import AutoProcessor, AutoModelForImageTextToText
+from peft import PeftModel, PeftConfig
+import numpy as np
+from detector import TextDetector
+import tempfile
+import os
+
+# List of available models with their IDs and prompts
+MODELS = {
+    "Gemma-3 10k": {
+        "id": "alakxender/dhivehi-image-text-init10k-gemma",
+        "prompt": "Extract the dhivehi text from the image"
+    }
+}
+
+class GemmaHandler:
+    def __init__(self):
+        self.model = None
+        self.processor = None
+        self.current_model_name = None
+        self.detector = TextDetector()
+
+    def load_model(self, model_name):
+        """Load the model and processor"""
+        model_id = MODELS[model_name]['id']
+
+        # Load the model and processor
+        self.model = AutoModelForImageTextToText.from_pretrained(
+            model_id,
+            device_map="auto",
+            torch_dtype=torch.bfloat16
+        )
+        self.processor = AutoProcessor.from_pretrained(model_id)
+        self.current_model_name = model_name
+
+    def process_image(self, model_name, image, progress=None):
+        """Process a single image"""
+        if image is None:
+            return "", []
+
+        # Load model if a different model is selected
+        if model_name != self.current_model_name:
+            try:
+                if progress is not None:
+                    progress(0, desc="Loading model...")
+            except:
+                pass
+            self.load_model(model_name)
+
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+
+        width, height = image.size
+        print(f"Image dimensions: {width}x{height}")
+
+        # Check if image proportions are similar to a single line:
+        # a typical single line has width significantly larger than height
+        # and an aspect ratio (width/height) greater than 3
+        aspect_ratio = width / height
+        if height <= 50 or aspect_ratio > 3:
+            try:
+                if progress is not None:
+                    progress(0.5, desc="Processing single line...")
+            except:
+                pass
+            result = self.process_single_line(image, model_name)
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+            return result, [image]
+        else:
+            return self.process_multi_line(image, model_name, progress)
+
+    def process_single_line(self, image, model_name):
+        """Process a single line of text"""
+        # Prepare the conversation format with instruction
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": MODELS[model_name]["prompt"]},
+                    {"type": "image", "image": image.convert("RGB")}
+                ],
+            }
+        ]
+
+        # Apply the chat template
+        prompt = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+        # Process into tensors
+        inputs = self.processor(
+            text=prompt,
+            images=[image],
+            return_tensors="pt"
+        ).to(self.model.device)
+
+        # Generate text output
+        with torch.no_grad():
+            outputs = self.model.generate(**inputs, max_new_tokens=128)
+
+        decoded = self.processor.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+
+        # Cleanup: remove any extra prefixes or instruction leakage
+        for unwanted in ["user", "model", "Instruction:", MODELS[model_name]["prompt"]]:
+            decoded = decoded.replace(unwanted, "")
+        return decoded.strip()
+
+    def process_multi_line(self, image, model_name, progress=None):
+        """Process a multi-line image by detecting text regions and OCRing each region"""
+        # Create a temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save input image
+            input_path = os.path.join(temp_dir, "input.png")
+            image.save(input_path)
+
+            # Initialize detector with temp directory
+            detector = TextDetector(output_dir=temp_dir)
+
+            # Run text detection
+            try:
+                if progress is not None:
+                    progress(0.1, desc="Detecting text regions...")
+            except:
+                pass
+
+            results = detector.process_input(input_path, save_images=True)
+
+            # Get text regions for the image
+            regions = detector.get_text_regions(results, "input")
+            if not regions:
+                return "No text regions detected", []
+
+            # Process each text region
+            page_regions = regions[0]  # First page
+            text_lines = page_regions.get('bboxes', [])
+
+            if not text_lines:
+                return "No text lines detected", []
+
+            # Sort text lines by y-coordinate (top to bottom)
+            text_lines.sort(key=lambda x: x['bbox'][1])
+
+            # Draw bounding boxes on the image
+            bbox_image = image.copy()
+            bbox_image = self.draw_bboxes(bbox_image, text_lines)
+
+            # Process each text line
+            all_text = []
+            total_lines = len(text_lines)
+
+            for i, line in enumerate(text_lines):
+                try:
+                    if progress is not None:
+                        progress((i + 1) / total_lines, desc=f"Processing line {i+1}/{total_lines}")
+                except:
+                    pass
+
+                # Extract text region using bbox
+                x1, y1, x2, y2 = line['bbox']
+                line_image = image.crop((x1, y1, x2, y2))
+
+                # Process the line
+                line_text = self.process_single_line(line_image, model_name)
+                all_text.append(line_text)
+
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+
+            return "\n".join(all_text), [bbox_image]  # Return as list for gallery
+
+    def process_pdf(self, pdf_path, model_name, progress=None):
+        """Process a PDF file"""
+        if pdf_path is None:
+            return "", []
+
+        # Load model if a different model is selected
+        if model_name != self.current_model_name:
+            try:
+                if progress is not None:
+                    progress(0, desc="Loading model...")
+            except:
+                pass
+            self.load_model(model_name)
+
+        # Create a temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Point the detector at the temp directory
+            self.detector.output_dir = temp_dir
+
+            # Run text detection on the PDF (first page only, page_range="0")
+            try:
+                if progress is not None:
+                    progress(0.1, desc="Detecting text regions in PDF...")
+            except:
+                pass
+
+            results = self.detector.process_input(pdf_path, save_images=True, page_range="0")
+
+            # Get text regions for the PDF
+            regions = self.detector.get_text_regions(results, os.path.splitext(os.path.basename(pdf_path))[0])
+            if not regions:
+                return "No text regions detected", []
+
+            # Process each page
+            all_text = []
+            bbox_images = []
+
+            # Get the base name of the PDF without extension
+            pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
+            for page_num, page_regions in enumerate(regions):
+                try:
+                    if progress is not None:
+                        progress(0.2 + (page_num/len(regions))*0.3, desc=f"Processing page {page_num+1}/{len(regions)}...")
+                except:
+                    pass
+
+                # Try different possible paths for the page image
+                possible_paths = [
+                    os.path.join(temp_dir, pdf_name, f"{pdf_name}_{page_num}_bbox.png"),  # Detector's actual path
+                    os.path.join(temp_dir, pdf_name, f"page_{page_num}.png"),  # Original path
+                    os.path.join(temp_dir, f"page_{page_num}.png"),  # Directly in output dir
+                    os.path.join(temp_dir, f"{pdf_name}_page_{page_num}.png")  # Alternative naming
+                ]
+
+                page_image = None
+                for page_image_path in possible_paths:
+                    if os.path.exists(page_image_path):
+                        page_image = Image.open(page_image_path)
+                        break
+
+                if page_image is None:
+                    all_text.append(f"\nPage {page_num+1}: Page image not found. Tried paths:\n" +
+                                    "\n".join(f"- {path}" for path in possible_paths))
+                    continue
+
+                text_lines = page_regions.get('bboxes', [])
+                if not text_lines:
+                    all_text.append(f"\nPage {page_num+1}: No text lines detected")
+                    continue
+
+                # Sort text lines by y-coordinate (top to bottom)
+                text_lines.sort(key=lambda x: x['bbox'][1])
+
+                # Draw bounding boxes on the image
+                bbox_image = page_image.copy()
+                bbox_image = self.draw_bboxes(bbox_image, text_lines)
+                bbox_images.append(bbox_image)
+
+                # Process each text line
+                page_text = []
+                total_lines = len(text_lines)
+
+                for i, line in enumerate(text_lines):
+                    try:
+                        if progress is not None:
+                            progress(0.5 + (page_num/len(regions))*0.2 + (i/total_lines)*0.3,
+                                     desc=f"Processing line {i+1}/{total_lines} on page {page_num+1}/{len(regions)}...")
+                    except:
+                        pass
+
+                    # Extract text region using bbox
+                    x1, y1, x2, y2 = line['bbox']
+                    line_image = page_image.crop((x1, y1, x2, y2))
+
+                    # Process the line
+                    line_text = self.process_single_line(line_image, model_name)
+                    page_text.append(line_text)
+
+                # Add page text without page number
+                all_text.extend(page_text)
+
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+
+            return "\n".join(all_text), bbox_images  # Return list of bbox images
+
+    @staticmethod
+    def draw_bboxes(image, text_lines):
+        """Draw bounding boxes on the image"""
+        draw = ImageDraw.Draw(image)
+        for line in text_lines:
+            # Draw polygon - flatten nested coordinates
+            polygon = line['polygon']
+            flat_polygon = [coord for point in polygon for coord in point]
+            draw.polygon(flat_polygon, outline="red", width=2)
+
+            # Draw bbox
+            x1, y1, x2, y2 = line['bbox']
+            draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
+
+            # Draw confidence score
+            draw.text((x1, y1 - 10), f"{line['confidence']:.2f}", fill="red")
+        return image
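
Both handlers gate on the same shape heuristic before any model call: an image at most 50 px tall, or wider than 3:1, is OCRed directly as one line, and anything else goes through TextDetector line by line. A pure-function restatement of the check in process_image, with invented sample dimensions, just to make the thresholds concrete:

    def is_single_line(width: int, height: int) -> bool:
        # Mirrors GemmaHandler.process_image: height <= 50 or width/height > 3
        return height <= 50 or (width / height) > 3

    assert is_single_line(400, 40)        # thin strip: single-line path
    assert is_single_line(900, 200)       # 4.5:1 banner: single-line path
    assert not is_single_line(800, 600)   # page-shaped: detector + per-line OCR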
paligemma2.py ADDED
@@ -0,0 +1,315 @@
+import torch
+from PIL import Image, ImageDraw
+from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+from peft import PeftModel, PeftConfig
+import numpy as np
+from detector import TextDetector
+import tempfile
+import os
+
+# List of available models with their IDs and prompts
+MODELS = {
+    "Medium-14k, Single Line": {
+        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-14k",
+        "prompt": "What text is written in this image?"
+    },
+    "Medium-16k, Single Line": {
+        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-md-16k",
+        "prompt": "What text is written in this image?"
+    },
+    "Small, Single Line": {
+        "id": "alakxender/paligemma2-qlora-vrd-dhivehi-ocr-224-sm",
+        "prompt": "What text is written in this image?"
+    }
+}
+
+class PaliGemma2Handler:
+    def __init__(self):
+        self.model = None
+        self.processor = None
+        self.current_model_name = None
+        self.detector = TextDetector()
+
+    def load_model(self, model_name):
+        """Load the model and processor"""
+        model_id = MODELS[model_name]['id']
+
+        # Load the PEFT configuration to get the base model path
+        peft_config = PeftConfig.from_pretrained(model_id)
+
+        # Load the base model
+        base_model = PaliGemmaForConditionalGeneration.from_pretrained(
+            peft_config.base_model_name_or_path,
+            device_map="auto",
+            torch_dtype=torch.bfloat16
+        )
+
+        # Load the adapter on top of the base model
+        self.model = PeftModel.from_pretrained(base_model, model_id)
+        self.processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path)
+        self.current_model_name = model_name
+
+    def process_image(self, model_name, image, progress=None):
+        """Process a single image"""
+        if image is None:
+            return "", []
+
+        # Load model if a different model is selected
+        if model_name != self.current_model_name:
+            try:
+                if progress is not None:
+                    progress(0, desc="Loading model...")
+            except:
+                pass
+            self.load_model(model_name)
+
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+
+        width, height = image.size
+        print(f"Image dimensions: {width}x{height}")
+
+        # Check if image proportions are similar to a single line:
+        # a typical single line has width significantly larger than height
+        # and an aspect ratio (width/height) greater than 3
+        aspect_ratio = width / height
+        if height <= 50 or aspect_ratio > 3:
+            try:
+                if progress is not None:
+                    progress(0.5, desc="Processing single line...")
+            except:
+                pass
+            result = self.process_single_line(image, model_name)
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+            return result, [image]
+        else:
+            return self.process_multi_line(image, model_name, progress)
+
+    def process_single_line(self, image, model_name):
+        """Process a single line of text"""
+        prompt = MODELS[model_name]["prompt"]
+        # Add image token to prompt
+        prompt = f"<image>{prompt}"
+
+        # First prepare inputs without moving to CUDA
+        model_inputs = self.processor(text=prompt, images=image, return_tensors="pt")
+
+        # Then move to CUDA and convert only image tensors to bfloat16
+        for k, v in model_inputs.items():
+            if k == "pixel_values":
+                model_inputs[k] = v.to(torch.bfloat16).to("cuda")
+            else:
+                model_inputs[k] = v.to("cuda")
+
+        outputs = self.model.generate(
+            **model_inputs,
+            max_new_tokens=500,
+            do_sample=False
+        )
+
+        generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        # Remove the prompt and any leading/trailing whitespace
+        cleaned_text = generated_text.replace(prompt, "").strip()
+        # Remove any remaining question marks or other artifacts
+        cleaned_text = cleaned_text.lstrip("?").strip()
+        # Remove the prompt text if it somehow appears in the output
+        cleaned_text = cleaned_text.replace("What text is written in this image?", "").strip()
+        return cleaned_text
+
+    def process_multi_line(self, image, model_name, progress=None):
+        """Process a multi-line image by detecting text regions and OCRing each region"""
+        # Create a temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save input image
+            input_path = os.path.join(temp_dir, "input.png")
+            image.save(input_path)
+
+            # Initialize detector with temp directory
+            detector = TextDetector(output_dir=temp_dir)
+
+            # Run text detection
+            try:
+                if progress is not None:
+                    progress(0.1, desc="Detecting text regions...")
+            except:
+                pass
+
+            results = detector.process_input(input_path, save_images=True)
+
+            # Get text regions for the image
+            regions = detector.get_text_regions(results, "input")
+            if not regions:
+                return "No text regions detected", []
+
+            # Process each text region
+            page_regions = regions[0]  # First page
+            text_lines = page_regions.get('bboxes', [])
+
+            if not text_lines:
+                return "No text lines detected", []
+
+            # Sort text lines by y-coordinate (top to bottom)
+            text_lines.sort(key=lambda x: x['bbox'][1])
+
+            # Draw bounding boxes on the image
+            bbox_image = image.copy()
+            bbox_image = self.draw_bboxes(bbox_image, text_lines)
+
+            # Process each text line
+            all_text = []
+            total_lines = len(text_lines)
+
+            for i, line in enumerate(text_lines):
+                try:
+                    if progress is not None:
+                        progress((i + 1) / total_lines, desc=f"Processing line {i+1}/{total_lines}")
+                except:
+                    pass
+
+                # Extract text region using bbox
+                x1, y1, x2, y2 = line['bbox']
+                line_image = image.crop((x1, y1, x2, y2))
+
+                # Process the line
+                line_text = self.process_single_line(line_image, model_name)
+                all_text.append(line_text)
+
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+
+            return "\n".join(all_text), [bbox_image]  # Return as list for gallery
+
+    def process_pdf(self, pdf_path, model_name, progress=None):
+        """Process a PDF file"""
+        if pdf_path is None:
+            return "", []
+
+        # Load model if a different model is selected
+        if model_name != self.current_model_name:
+            try:
+                if progress is not None:
+                    progress(0, desc="Loading model...")
+            except:
+                pass
+            self.load_model(model_name)
+
+        # Create a temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Point the detector at the temp directory
+            self.detector.output_dir = temp_dir
+
+            # Run text detection on the PDF (first page only, page_range="0")
+            try:
+                if progress is not None:
+                    progress(0.1, desc="Detecting text regions in PDF...")
+            except:
+                pass
+
+            results = self.detector.process_input(pdf_path, save_images=True, page_range="0")
+
+            # Get text regions for the PDF
+            regions = self.detector.get_text_regions(results, os.path.splitext(os.path.basename(pdf_path))[0])
+            if not regions:
+                return "No text regions detected", []
+
+            # Process each page
+            all_text = []
+            bbox_images = []
+
+            # Get the base name of the PDF without extension
+            pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
+            for page_num, page_regions in enumerate(regions):
+                try:
+                    if progress is not None:
+                        progress(0.2 + (page_num/len(regions))*0.3, desc=f"Processing page {page_num+1}/{len(regions)}...")
+                except:
+                    pass
+
+                # Try different possible paths for the page image
+                possible_paths = [
+                    os.path.join(temp_dir, pdf_name, f"{pdf_name}_{page_num}_bbox.png"),  # Detector's actual path
+                    os.path.join(temp_dir, pdf_name, f"page_{page_num}.png"),  # Original path
+                    os.path.join(temp_dir, f"page_{page_num}.png"),  # Directly in output dir
+                    os.path.join(temp_dir, f"{pdf_name}_page_{page_num}.png")  # Alternative naming
+                ]
+
+                page_image = None
+                for page_image_path in possible_paths:
+                    if os.path.exists(page_image_path):
+                        page_image = Image.open(page_image_path)
+                        break
+
+                if page_image is None:
+                    all_text.append(f"\nPage {page_num+1}: Page image not found. Tried paths:\n" +
+                                    "\n".join(f"- {path}" for path in possible_paths))
+                    continue
+
+                text_lines = page_regions.get('bboxes', [])
+                if not text_lines:
+                    all_text.append(f"\nPage {page_num+1}: No text lines detected")
+                    continue
+
+                # Sort text lines by y-coordinate (top to bottom)
+                text_lines.sort(key=lambda x: x['bbox'][1])
+
+                # Draw bounding boxes on the image
+                bbox_image = page_image.copy()
+                bbox_image = self.draw_bboxes(bbox_image, text_lines)
+                bbox_images.append(bbox_image)
+
+                # Process each text line
+                page_text = []
+                total_lines = len(text_lines)
+
+                for i, line in enumerate(text_lines):
+                    try:
+                        if progress is not None:
+                            progress(0.5 + (page_num/len(regions))*0.2 + (i/total_lines)*0.3,
+                                     desc=f"Processing line {i+1}/{total_lines} on page {page_num+1}/{len(regions)}...")
+                    except:
+                        pass
+
+                    # Extract text region using bbox
+                    x1, y1, x2, y2 = line['bbox']
+                    line_image = page_image.crop((x1, y1, x2, y2))
+
+                    # Process the line
+                    line_text = self.process_single_line(line_image, model_name)
+                    page_text.append(line_text)
+
+                # Add page text without page number
+                all_text.extend(page_text)
+
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+
+            return "\n".join(all_text), bbox_images  # Return list of bbox images
+
+    @staticmethod
+    def draw_bboxes(image, text_lines):
+        """Draw bounding boxes on the image"""
+        draw = ImageDraw.Draw(image)
+        for line in text_lines:
+            # Draw polygon - flatten nested coordinates
+            polygon = line['polygon']
+            flat_polygon = [coord for point in polygon for coord in point]
+            draw.polygon(flat_polygon, outline="red", width=2)
+
+            # Draw bbox
+            x1, y1, x2, y2 = line['bbox']
+            draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
+
+            # Draw confidence score
+            draw.text((x1, y1 - 10), f"{line['confidence']:.2f}", fill="red")
+        return image
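
PaliGemma decodes the prompt back out ahead of the answer, so process_single_line strips it in three passes. A self-contained restatement of that cleanup logic; the sample string is invented for illustration:

    PROMPT = "What text is written in this image?"

    def clean(generated: str) -> str:
        # Same three passes as PaliGemma2Handler.process_single_line:
        # drop the "<image>"-prefixed prompt, any stray leading "?",
        # then any bare prompt echo that survived decoding.
        text = generated.replace(f"<image>{PROMPT}", "").strip()
        text = text.lstrip("?").strip()
        return text.replace(PROMPT, "").strip()

    print(clean(f"{PROMPT}\nexample output line"))  # -> "example output line"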