Spaces:

alakxender
/

dhivehi-ocr

Running on Zero

App Files Files Community

alakxender committed on Jun 19

Commit

97bb8f1

1 Parent(s): edd2900

g

Browse files

Files changed (3) hide show

app.py +196 -355
gemma.py +304 -0
paligemma2.py +315 -0

app.py CHANGED Viewed

@@ -3,288 +3,42 @@ import gradio as gr
 import os
 import sys
 import subprocess
-from PIL import Image, ImageDraw
-from detector import TextDetector
-import tempfile
-import shutil
-import json
-from datetime import datetime
 import numpy as np
-# List of available models with their IDs and prompts
-MODELS = {
-    "Medium-14k, Single Line": { # /lab/mx01/md/sl-14/ft/
-        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-14k",
-        "prompt": "What text is written in this image?"
-    },
-    "Medium-16k, Single Line": { # /lab/mx01/md/sl-16/ft/
-        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-md-16k",
-        "prompt": "What text is written in this image?"
-    },
-    "Small, Single Line": { # /lab/mx01/sm/sl/ft/
-        "id": "alakxender/paligemma2-qlora-vrd-dhivehi-ocr-224-sm",
-        "prompt": "What text is written in this image?"
-    }
-}
-""" "Full Text": { # /lab/mx01/pr/sl/ft/
-        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-mx01",
-        "prompt": "What text is written in this image?",
-    } ,
-    Full Text": { # /lab/mx01/pr/sl/ft/
-        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-448-mx01",
-        "prompt": "OCR",
-    }
-    ,
-    Final": { # /lab/mx01/pr/sl/ft-final/
-        "id": "alakxender/paligemma2-dhivehi-ocr-448-mx01-final",
-        "prompt": "OCR", # smaller the better: 3k vrd, 3k printed, 3k handwritten, 1k single line
-    }"""
-# Global model state
-model = None
-processor = None
-current_model_name = None
-detector = TextDetector()
-def load_model(model_name):
-    """Load the model and processor"""
-    global model, processor, current_model_name
-    model_id = MODELS[model_name]['id']
-    # Load the PEFT configuration to get the base model path
-    peft_config = PeftConfig.from_pretrained(model_id)
-    # Load the base model
-    base_model = PaliGemmaForConditionalGeneration.from_pretrained(
-        peft_config.base_model_name_or_path,
-        device_map="auto",
-        torch_dtype=torch.bfloat16
-    )
-    # Load the adapter on top of the base model
-    model = PeftModel.from_pretrained(base_model, model_id)
-    processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path)
-    current_model_name = model_name
-def process_single_line(image, model_name):
-    """Process a single line of text"""
-    prompt = MODELS[model_name]["prompt"]
-    # Add image token to prompt
-    prompt = f"<image>{prompt}"
-    # First prepare inputs without moving to CUDA
-    model_inputs = processor(text=prompt, images=image, return_tensors="pt")
-    # Then move to CUDA and convert only image tensors to bfloat16
-    for k, v in model_inputs.items():
-        if k == "pixel_values":
-            model_inputs[k] = v.to(torch.bfloat16).to("cuda")
-        else:
-            model_inputs[k] = v.to("cuda")
-    outputs = model.generate(
-        **model_inputs,
-        max_new_tokens=500,
-        do_sample=False
-    )
-    generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    # Remove the prompt and any leading/trailing whitespace
-    cleaned_text = generated_text.replace(prompt, "").strip()
-    # Remove any remaining question marks or other artifacts
-    cleaned_text = cleaned_text.lstrip("?").strip()
-    # Remove the prompt text if it somehow appears in the output
-    cleaned_text = cleaned_text.replace("What text is written in this image?", "").strip()
-    return cleaned_text
-def draw_bboxes(image, text_lines):
-    """Draw bounding boxes on the image"""
-    draw = ImageDraw.Draw(image)
-    for line in text_lines:
-        # Draw polygon - flatten nested coordinates
-        polygon = line['polygon']
-        flat_polygon = [coord for point in polygon for coord in point]
-        draw.polygon(flat_polygon, outline="red", width=2)
-        # Draw bbox
-        x1, y1, x2, y2 = line['bbox']
-        draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
-        # Draw confidence score
-        draw.text((x1, y1 - 10), f"{line['confidence']:.2f}", fill="red")
-    return image
-def process_multi_line(image, model_name, progress=gr.Progress()):
-    """Process a multi-line image by detecting text regions and OCRing each region"""
-    # Create temporary directory
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Save input image
-        input_path = os.path.join(temp_dir, "input.png")
-        image.save(input_path)
-        # Initialize detector with temp directory
-        detector = TextDetector(output_dir=temp_dir)
-        # Run text detection
-        progress(0.1, desc="Detecting text regions...")
-        results = detector.process_input(input_path, save_images=True)
-        # Get text regions for the image
-        regions = detector.get_text_regions(results, "input")
-        if not regions:
-            return "No text regions detected", []
-        # Process each text region
-        page_regions = regions[0]  # First page
-        text_lines = page_regions.get('bboxes', [])
-        if not text_lines:
-            return "No text lines detected", []
-        # Sort text lines by y-coordinate (top to bottom)
-        text_lines.sort(key=lambda x: x['bbox'][1])
-        # Draw bounding boxes on the image
-        bbox_image = image.copy()
-        bbox_image = draw_bboxes(bbox_image, text_lines)
-        # Process each text line
-        all_text = []
-        total_lines = len(text_lines)
-        for i, line in enumerate(text_lines):
-            progress(0.2 + (i/total_lines)*0.8, desc=f"Processing line {i+1}/{total_lines}...")
-            # Extract text region using bbox
-            x1, y1, x2, y2 = line['bbox']
-            line_image = image.crop((x1, y1, x2, y2))
-            # Process the line
-            line_text = process_single_line(line_image, model_name)
-            all_text.append(line_text)
-        progress(1.0, desc="Done!")
-        return "\n".join(all_text), [bbox_image]  # Return as list for gallery
 @spaces.GPU
-def process_image(model_name, image, progress=gr.Progress()):
-    """Process a single image"""
-    if image is None:
-        return "", []
-    # Load model if different model selected
-    if model_name != current_model_name:
-        progress(0, desc="Loading model...")
-        load_model(model_name)
-    if isinstance(image, np.ndarray):
-        image = Image.fromarray(image)
-    width, height = image.size
-    print(f"Image dimensions: {width}x{height}")
-    if height > 50:
-        return process_multi_line(image, model_name, progress)
-    else:
-        return process_single_line(image, model_name), [image]
 @spaces.GPU
-def process_pdf(pdf_path, model_name, progress=gr.Progress()):
-    """Process a PDF file"""
-    if pdf_path is None:
-        return "", []
-    # Load model if different model selected
-    if model_name != current_model_name:
-        progress(0, desc="Loading model...")
-        load_model(model_name)
-    # Create temporary directory
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Initialize detector with temp directory
-        detector = TextDetector(output_dir=temp_dir)
-        # Run text detection on PDF (process first 2 pages)
-        progress(0.1, desc="Detecting text regions in PDF...")
-        results = detector.process_input(pdf_path, save_images=True, page_range="0")
-        # Get text regions for the PDF
-        regions = detector.get_text_regions(results, os.path.splitext(os.path.basename(pdf_path))[0])
-        if not regions:
-            return "No text regions detected", []
-        # Process each page
-        all_text = []
-        bbox_images = []
-        # Get the base name of the PDF without extension
-        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
-        for page_num, page_regions in enumerate(regions):
-            progress(0.2 + (page_num/2)*0.3, desc=f"Processing page {page_num+1}...")
-            # Try different possible paths for the page image
-            possible_paths = [
-                os.path.join(temp_dir, pdf_name, f"{pdf_name}_{page_num}_bbox.png"),  # Detector's actual path
-                os.path.join(temp_dir, pdf_name, f"page_{page_num}.png"),  # Original path
-                os.path.join(temp_dir, f"page_{page_num}.png"),  # Direct in output dir
-                os.path.join(temp_dir, f"{pdf_name}_page_{page_num}.png")  # Alternative naming
-            ]
-            page_image = None
-            for page_image_path in possible_paths:
-                if os.path.exists(page_image_path):
-                    page_image = Image.open(page_image_path)
-                    break
-            if page_image is None:
-                all_text.append(f"\nPage {page_num+1}: Page image not found. Tried paths:\n" +
-                              "\n".join(f"- {path}" for path in possible_paths))
-                continue
-            text_lines = page_regions.get('bboxes', [])
-            if not text_lines:
-                all_text.append(f"\nPage {page_num+1}: No text lines detected")
-                continue
-            # Sort text lines by y-coordinate (top to bottom)
-            text_lines.sort(key=lambda x: x['bbox'][1])
-            # Draw bounding boxes on the image
-            bbox_image = page_image.copy()
-            bbox_image = draw_bboxes(bbox_image, text_lines)
-            bbox_images.append(bbox_image)
-            # Process each text line
-            page_text = []
-            total_lines = len(text_lines)
-            for i, line in enumerate(text_lines):
-                progress(0.5 + (page_num/2)*0.2 + (i/total_lines)*0.3,
-                        desc=f"Processing line {i+1}/{total_lines} on page {page_num+1}...")
-                # Extract text region using bbox
-                x1, y1, x2, y2 = line['bbox']
-                line_image = page_image.crop((x1, y1, x2, y2))
-                # Process the line
-                line_text = process_single_line(line_image, model_name)
-                page_text.append(line_text)
-            # Add page text without page number
-            all_text.extend(page_text)
-        progress(1.0, desc="Done!")
-        return "\n".join(all_text), bbox_images  # Return list of bbox images
 # Example images with descriptions
 examples = [
     ["type_1_sl.png", "Typed Dhivehi text sample 1"],
     ["type_2_sl.png", "Typed Dhivehi text sample 2"],
-    ["hw_1_sl.png", "Handwritten Dhivehi text sample 1"], # exp this
-    ["hw_2_sl.jpg", "Handwritten Dhivehi text sample 2"],  # exp val3
-    ["hw_3_sl.png", "Handwritten Dhivehi text sample 3"],  # exp val2
-    ["hw_4_sl.png", "Handwritten Dhivehi text sample 4"], # exp val1
     ["ml.png", "Multi-line Dhivehi text sample"]
 ]
@@ -299,101 +53,190 @@ css = """
 }
 """
-with gr.Blocks(title="Dhivehi OCR",css=css) as demo:
-    gr.Markdown("# Dhivehi OCR")
-    gr.Markdown("Thaana OCR experimental finetunes")
-    with gr.Row():
-        model_dropdown = gr.Dropdown(
-            choices=list(MODELS.keys()),
-            value=list(MODELS.keys())[0], # Default to first model
-            label="Select Model"
-        )
     with gr.Tabs():
-        with gr.Tab("Image Input"):
-            with gr.Row():
-                with gr.Column(scale=2):
-                    image_input = gr.Image(type="pil", label="Input Image")
-                    image_submit_btn = gr.Button("Extract Text")
-                    # Image examples
-                    gr.Examples(
-                        examples=[[img] for img, _ in examples],
-                        inputs=[image_input],
-                        label="Example Images",
-                        examples_per_page=8
-                    )
-                with gr.Column(scale=3):
-                    with gr.Tabs():
-                        with gr.Tab("Extracted Text"):
-                            image_text_output = gr.Textbox(
-                                lines=5,
-                                label="Extracted Text",
-                                show_copy_button=True,
-                                rtl=True,
-                                elem_classes="textbox1"
                             )
-                        with gr.Tab("Detected Text Regions"):
-                            image_bbox_output = gr.Gallery(
-                                label="Detected Text Regions",
-                                show_label=True,
-                                columns=2
                             )
-        with gr.Tab("PDF Input"):
-            with gr.Row():
-                with gr.Column(scale=2):
-                    pdf_input = gr.File(
-                        label="Input PDF",
-                        file_types=[".pdf"]
-                    )
-                    pdf_submit_btn = gr.Button("Extract Text")
-                    # PDF examples
-                    gr.Examples(
-                        examples=[
-                            ["example.pdf", "Example 1"],
-                        ],  # Add PDF examples here if needed
-                        inputs=[pdf_input],
-                        label="Example PDFs",
-                        examples_per_page=8
-                    )
-                with gr.Column(scale=3):
-                    with gr.Tabs():
-                        with gr.Tab("Extracted Text"):
-                            pdf_text_output = gr.Textbox(
-                                lines=5,
-                                label="Extracted Text",
-                                show_copy_button=True,
-                                rtl=True,
-                                elem_classes="textbox1"
                             )
-                        with gr.Tab("Detected Text Regions"):
-                            pdf_bbox_output = gr.Gallery(
-                                label="Detected Text Regions",
-                                show_label=True,
-                                columns=2
                             )
-    # Process image when button is clicked
-    image_submit_btn.click(
-        fn=process_image,
-        inputs=[model_dropdown, image_input],
-        outputs=[image_text_output, image_bbox_output]
     )
-    # Process PDF when button is clicked
-    pdf_submit_btn.click(
-        fn=process_pdf,
-        inputs=[pdf_input, model_dropdown],
-        outputs=[pdf_text_output, pdf_bbox_output]
     )
 # Function to install requirements
 def install_requirements():
@@ -427,18 +270,16 @@ def install_requirements():
 # Launch the app
 if __name__ == "__main__":
-     # First install requirements
     success = install_requirements()
     if success:
         print("All requirements installed successfully")
-        from transformers.image_utils import load_image
-        import torch
         from transformers import PaliGemmaForConditionalGeneration, AutoProcessor
         from peft import PeftModel, PeftConfig
-        # Load the first model by default
-        load_model(list(MODELS.keys())[0])
         #demo.launch(server_name="0.0.0.0", server_port=7812)
         demo.launch()

 import os
 import sys
 import subprocess
 import numpy as np
+from paligemma2 import PaliGemma2Handler, MODELS as PALIGEMMA_MODELS
+from gemma import GemmaHandler, MODELS as GEMMA_MODELS
+# Initialize model handlers: one module-level instance per model family,
+# used by the process_* wrapper functions in this file.
+# NOTE(review): GemmaHandler.__init__ constructs a TextDetector, so that work
+# happens as soon as this module is imported; PaliGemma2Handler presumably
+# behaves the same way -- confirm in paligemma2.py.
+paligemma_handler = PaliGemma2Handler()
+gemma_handler = GemmaHandler()
 @spaces.GPU
+def process_image_paligemma(model_name, image, progress=gr.Progress()):
+    """Process a single image with PaliGemma2.
+
+    Thin wrapper that forwards its arguments unchanged to
+    ``paligemma_handler.process_image`` and returns that call's result.
+
+    Args:
+        model_name: Value of the PaliGemma2 model dropdown, i.e. a key of
+            ``PALIGEMMA_MODELS``.
+        image: Value of the image input component (created with
+            ``type="pil"``).
+        progress: Gradio progress tracker, passed through to the handler.
+
+    Returns:
+        Whatever the handler returns. The click wiring maps it onto a text
+        box and a gallery, so presumably ``(text, images)`` -- confirm in
+        paligemma2.py.
+    """
+    return paligemma_handler.process_image(model_name, image, progress)
+@spaces.GPU
+def process_image_gemma(model_name, image, progress=gr.Progress()):
+    """Process a single image with Gemma.
+
+    Thin wrapper that forwards its arguments unchanged to
+    ``gemma_handler.process_image`` and returns that call's result.
+
+    Args:
+        model_name: Value of the Gemma model dropdown, i.e. a key of
+            ``GEMMA_MODELS``.
+        image: Value of the image input component (created with
+            ``type="pil"``).
+        progress: Gradio progress tracker, passed through to the handler.
+
+    Returns:
+        Whatever the handler returns; the click wiring maps it onto the
+        extracted-text box and the detected-regions gallery.
+    """
+    return gemma_handler.process_image(model_name, image, progress)
+@spaces.GPU
+def process_pdf_paligemma(pdf_path, model_name, progress=gr.Progress()):
+    """Process a PDF file with PaliGemma2.
+
+    Thin wrapper that forwards its arguments unchanged to
+    ``paligemma_handler.process_pdf`` and returns that call's result.
+    Note the argument order: the PDF comes first and the model name second,
+    the reverse of the image wrappers.
+
+    Args:
+        pdf_path: Value of the PDF file input component.
+        model_name: Value of the PaliGemma2 model dropdown, i.e. a key of
+            ``PALIGEMMA_MODELS``.
+        progress: Gradio progress tracker, passed through to the handler.
+
+    Returns:
+        Whatever the handler returns. The click wiring maps it onto a text
+        box and a gallery, so presumably ``(text, images)`` -- confirm in
+        paligemma2.py.
+    """
+    return paligemma_handler.process_pdf(pdf_path, model_name, progress)
 @spaces.GPU
+def process_pdf_gemma(pdf_path, model_name, progress=gr.Progress()):
+    """Process a PDF file with Gemma.
+
+    Thin wrapper that forwards its arguments unchanged to
+    ``gemma_handler.process_pdf`` and returns that call's result.
+    Note the argument order: the PDF comes first and the model name second,
+    the reverse of the image wrappers.
+
+    Args:
+        pdf_path: Value of the PDF file input component; the Gemma handler
+            passes it to ``os.path.basename``, so it is treated as a path.
+        model_name: Value of the Gemma model dropdown, i.e. a key of
+            ``GEMMA_MODELS``.
+        progress: Gradio progress tracker, passed through to the handler.
+
+    Returns:
+        Whatever the handler returns; the click wiring maps it onto the
+        extracted-text box and the detected-regions gallery.
+    """
+    return gemma_handler.process_pdf(pdf_path, model_name, progress)
 # Example images with descriptions
+# Each entry is [image file name, human-readable description]. Only the file
+# name is handed to gr.Examples (via `[[img] for img, _ in examples]`); the
+# description is not shown in the UI.
+# NOTE(review): the file names are relative -- presumably resolved against
+# the app's working directory; confirm the files ship with the Space.
 examples = [
     ["type_1_sl.png", "Typed Dhivehi text sample 1"],
     ["type_2_sl.png", "Typed Dhivehi text sample 2"],
+    ["hw_1_sl.png", "Handwritten Dhivehi text sample 1"],
+    ["hw_2_sl.jpg", "Handwritten Dhivehi text sample 2"],
+    ["hw_3_sl.png", "Handwritten Dhivehi text sample 3"],
+    ["hw_4_sl.png", "Handwritten Dhivehi text sample 4"],
     ["ml.png", "Multi-line Dhivehi text sample"]
 ]
 }
 """
+# Build the UI: one top-level tab per model family (PaliGemma2, Gemma). Each
+# family tab has its own model dropdown plus an "Image Input" and a
+# "PDF Input" sub-tab, and each sub-tab shows the extracted text and a
+# gallery of detected text regions.
+with gr.Blocks(title="Dhivehi Image to Text",css=css) as demo:
+    gr.Markdown("# Dhivehi Image to Text")
+    gr.Markdown("Dhivehi Image to Text experimental finetunes")
     with gr.Tabs():
+        # ---- PaliGemma2 family ----
+        with gr.Tab("PaliGemma2"):
+            # Defaults to the first entry of PALIGEMMA_MODELS.
+            model_dropdown_paligemma = gr.Dropdown(
+                choices=list(PALIGEMMA_MODELS.keys()),
+                value=list(PALIGEMMA_MODELS.keys())[0],
+                label="Select PaliGemma2 Model"
+            )
+            with gr.Tabs():
+                with gr.Tab("Image Input"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            image_input_paligemma = gr.Image(type="pil", label="Input Image")
+                            image_submit_btn_paligemma = gr.Button("Extract Text")
+                            # Image examples (file name only; the description is dropped)
+                            gr.Examples(
+                                examples=[[img] for img, _ in examples],
+                                inputs=[image_input_paligemma],
+                                label="Example Images",
+                                examples_per_page=8
+                            )
+                        with gr.Column(scale=3):
+                            with gr.Tabs():
+                                with gr.Tab("Extracted Text"):
+                                    # rtl=True: the text box is rendered right-to-left.
+                                    image_text_output_paligemma = gr.Textbox(
+                                        lines=5,
+                                        label="Extracted Text",
+                                        show_copy_button=True,
+                                        rtl=True,
+                                        elem_classes="textbox1"
+                                    )
+                                with gr.Tab("Detected Text Regions"):
+                                    image_bbox_output_paligemma = gr.Gallery(
+                                        label="Detected Text Regions",
+                                        show_label=True,
+                                        columns=2
+                                    )
+                with gr.Tab("PDF Input"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            pdf_input_paligemma = gr.File(
+                                label="Input PDF",
+                                file_types=[".pdf"]
+                            )
+                            pdf_submit_btn_paligemma = gr.Button("Extract Text from PDF")
+                            # PDF examples
+                            # NOTE(review): each row carries two values but only one
+                            # input component is listed -- confirm gr.Examples
+                            # ignores the extra "Example 1" label.
+                            gr.Examples(
+                                examples=[
+                                    ["example.pdf", "Example 1"],
+                                ],
+                                inputs=[pdf_input_paligemma],
+                                label="Example PDFs",
+                                examples_per_page=8
                             )
+                        with gr.Column(scale=3):
+                            with gr.Tabs():
+                                with gr.Tab("Extracted Text"):
+                                    pdf_text_output_paligemma = gr.Textbox(
+                                        lines=5,
+                                        label="Extracted Text",
+                                        show_copy_button=True,
+                                        rtl=True,
+                                        elem_classes="textbox1"
+                                    )
+                                with gr.Tab("Detected Text Regions"):
+                                    pdf_bbox_output_paligemma = gr.Gallery(
+                                        label="Detected Text Regions",
+                                        show_label=True,
+                                        columns=2
+                                    )
+        # ---- Gemma family (same layout as the PaliGemma2 tab) ----
+        with gr.Tab("Gemma"):
+            # Defaults to the first entry of GEMMA_MODELS.
+            model_dropdown_gemma = gr.Dropdown(
+                choices=list(GEMMA_MODELS.keys()),
+                value=list(GEMMA_MODELS.keys())[0],
+                label="Select Gemma Model"
+            )
+            with gr.Tabs():
+                with gr.Tab("Image Input"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            image_input_gemma = gr.Image(type="pil", label="Input Image")
+                            image_submit_btn_gemma = gr.Button("Extract Text")
+                            # Image examples (file name only; the description is dropped)
+                            gr.Examples(
+                                examples=[[img] for img, _ in examples],
+                                inputs=[image_input_gemma],
+                                label="Example Images",
+                                examples_per_page=8
                             )
+                        with gr.Column(scale=3):
+                            with gr.Tabs():
+                                with gr.Tab("Extracted Text"):
+                                    image_text_output_gemma = gr.Textbox(
+                                        lines=5,
+                                        label="Extracted Text",
+                                        show_copy_button=True,
+                                        rtl=True,
+                                        elem_classes="textbox1"
+                                    )
+                                with gr.Tab("Detected Text Regions"):
+                                    image_bbox_output_gemma = gr.Gallery(
+                                        label="Detected Text Regions",
+                                        show_label=True,
+                                        columns=2
+                                    )
+                with gr.Tab("PDF Input"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            pdf_input_gemma = gr.File(
+                                label="Input PDF",
+                                file_types=[".pdf"]
                             )
+                            pdf_submit_btn_gemma = gr.Button("Extract Text from PDF")
+                            # PDF examples
+                            # NOTE(review): each row carries two values but only one
+                            # input component is listed -- confirm gr.Examples
+                            # ignores the extra "Example 1" label.
+                            gr.Examples(
+                                examples=[
+                                    ["example.pdf", "Example 1"],
+                                ],
+                                inputs=[pdf_input_gemma],
+                                label="Example PDFs",
+                                examples_per_page=8
                             )
+                        with gr.Column(scale=3):
+                            with gr.Tabs():
+                                with gr.Tab("Extracted Text"):
+                                    pdf_text_output_gemma = gr.Textbox(
+                                        lines=5,
+                                        label="Extracted Text",
+                                        show_copy_button=True,
+                                        rtl=True,
+                                        elem_classes="textbox1"
+                                    )
+                                with gr.Tab("Detected Text Regions"):
+                                    pdf_bbox_output_gemma = gr.Gallery(
+                                        label="Detected Text Regions",
+                                        show_label=True,
+                                        columns=2
+                                    )
+    # PaliGemma2 event handlers
+    # Input order must match each wrapper's signature: the image wrappers take
+    # (model_name, image) while the PDF wrappers take (pdf_path, model_name).
+    image_submit_btn_paligemma.click(
+        fn=process_image_paligemma,
+        inputs=[model_dropdown_paligemma, image_input_paligemma],
+        outputs=[image_text_output_paligemma, image_bbox_output_paligemma]
+    )
+    pdf_submit_btn_paligemma.click(
+        fn=process_pdf_paligemma,
+        inputs=[pdf_input_paligemma, model_dropdown_paligemma],
+        outputs=[pdf_text_output_paligemma, pdf_bbox_output_paligemma]
     )
+    # Gemma event handlers (same input ordering as the PaliGemma2 handlers)
+    image_submit_btn_gemma.click(
+        fn=process_image_gemma,
+        inputs=[model_dropdown_gemma, image_input_gemma],
+        outputs=[image_text_output_gemma, image_bbox_output_gemma]
     )
+    pdf_submit_btn_gemma.click(
+        fn=process_pdf_gemma,
+        inputs=[pdf_input_gemma, model_dropdown_gemma],
+        outputs=[pdf_text_output_gemma, pdf_bbox_output_gemma]
+    )
 # Function to install requirements
 def install_requirements():
 # Launch the app
 if __name__ == "__main__":
+    # First install requirements
+    # NOTE(review): gemma.py (and presumably paligemma2.py) import
+    # torch/transformers/peft at module import time, and this file appears
+    # to import those modules at its top -- i.e. before this install step
+    # runs. Confirm the install step can still help when they are missing.
     success = install_requirements()
     if success:
         print("All requirements installed successfully")
+        # NOTE(review): the names imported below are not referenced in the
+        # visible part of this file now that model loading lives in the
+        # handler modules -- confirm whether they are still needed.
         from transformers import PaliGemmaForConditionalGeneration, AutoProcessor
         from peft import PeftModel, PeftConfig
+        # Eager loading of the first PaliGemma2 model is currently disabled
+        # (the call below is commented out).
+        #paligemma_handler.load_model(list(PALIGEMMA_MODELS.keys())[0])
+        # Alternative launch with an explicit host/port, kept for reference:
         #demo.launch(server_name="0.0.0.0", server_port=7812)
         demo.launch()

gemma.py ADDED Viewed

	@@ -0,0 +1,304 @@

+import torch
+from PIL import Image, ImageDraw
+from transformers import AutoProcessor, AutoModelForImageTextToText
+from peft import PeftModel, PeftConfig
+import numpy as np
+from detector import TextDetector
+import tempfile
+import os
+# List of available models with their IDs and prompts
+# Keys are the display names that GemmaHandler methods receive as
+# `model_name`. For each entry:
+#   "id"     -- identifier passed to `from_pretrained` for both the model
+#               and the processor
+#   "prompt" -- instruction text sent alongside every image
+MODELS = {
+    "Gemma-3 10k": {
+        "id": "alakxender/dhivehi-image-text-init10k-gemma",
+        "prompt": "Extract the dhivehi text from the image"
+    }
+}
+class GemmaHandler:
+    def __init__(self):
+        """Create a handler with a text detector and no model loaded yet."""
+        self.detector = TextDetector()
+        # Model state stays empty until load_model() fills it in.
+        self.current_model_name = None
+        self.processor = None
+        self.model = None
+    def load_model(self, model_name):
+        """Load the model and processor for ``model_name`` and make them current.
+
+        Args:
+            model_name: Key of ``MODELS``; its ``"id"`` entry is passed to
+                ``from_pretrained`` for both the model and the processor.
+
+        Raises:
+            KeyError: If ``model_name`` is not a key of ``MODELS``.
+        """
+        model_id = MODELS[model_name]['id']
+        # Load into locals first and publish all three attributes together.
+        # If loading fails part-way (e.g. the processor cannot be fetched),
+        # the previously loaded model/processor/name stay consistent instead
+        # of pairing a new model with the old processor and old name.
+        model = AutoModelForImageTextToText.from_pretrained(
+            model_id,
+            device_map="auto",
+            torch_dtype=torch.bfloat16
+        )
+        processor = AutoProcessor.from_pretrained(model_id)
+        self.model = model
+        self.processor = processor
+        self.current_model_name = model_name
+    def process_image(self, model_name, image, progress=None):
+        """Extract text from a single image.
+
+        Loads ``model_name`` first if it is not the currently loaded model.
+        Images that look like a single text line (height <= 50 px or
+        width/height > 3) go straight to the model; anything else is sent
+        through text-line detection.
+
+        Args:
+            model_name: Key of ``MODELS`` selecting the model to use.
+            image: PIL image or numpy array (arrays are converted with
+                ``Image.fromarray``); ``None`` yields an empty result.
+            progress: Optional callable invoked as
+                ``progress(fraction, desc=...)``.
+
+        Returns:
+            Tuple ``(text, images)`` where ``images`` is a list of images
+            to display.
+        """
+        def report(fraction, desc):
+            # Progress reporting is best-effort: a failing UI callback must
+            # not abort OCR. Only ordinary exceptions are suppressed, so
+            # KeyboardInterrupt/SystemExit still propagate.
+            if progress is None:
+                return
+            try:
+                progress(fraction, desc=desc)
+            except Exception as exc:
+                print(f"Progress update failed: {exc}")
+
+        if image is None:
+            return "", []
+        # Load model if different model selected
+        if model_name != self.current_model_name:
+            report(0, "Loading model...")
+            self.load_model(model_name)
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        width, height = image.size
+        print(f"Image dimensions: {width}x{height}")
+        # A zero-sized image has nothing to read, and a zero height would
+        # make the aspect-ratio division below raise ZeroDivisionError.
+        if width == 0 or height == 0:
+            return "", []
+        # A single line is either very short or much wider than it is tall.
+        aspect_ratio = width / height
+        if height <= 50 or aspect_ratio > 3:
+            report(0.5, "Processing single line...")
+            result = self.process_single_line(image, model_name)
+            report(1.0, "Done!")
+            return result, [image]
+        return self.process_multi_line(image, model_name, progress)
+    def process_single_line(self, image, model_name):
+        """Run the loaded model on one image and return the generated text.
+
+        Args:
+            image: PIL image containing the text to read.
+            model_name: Key of ``MODELS``; selects the instruction prompt.
+                The model and processor used are whatever is currently
+                loaded on this handler.
+
+        Returns:
+            The generated text with surrounding whitespace removed.
+        """
+        instruction = MODELS[model_name]["prompt"]
+        # Convert once and use the same RGB image for both the chat message
+        # and the processor call, so non-RGB inputs (RGBA, palette,
+        # grayscale) never reach the processor unconverted.
+        rgb_image = image.convert("RGB")
+        # Prepare the conversation format with instruction
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": instruction},
+                    {"type": "image", "image": rgb_image}
+                ],
+            }
+        ]
+        # Apply the chat template
+        prompt = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        # Process into tensors
+        inputs = self.processor(
+            text=prompt,
+            images=[rgb_image],
+            return_tensors="pt"
+        ).to(self.model.device)
+        # Generate text output
+        with torch.no_grad():
+            outputs = self.model.generate(**inputs, max_new_tokens=128)
+        # generate() returns the prompt tokens followed by the new tokens.
+        # Decode only the continuation so the chat-template role markers and
+        # the instruction never appear in the result; this avoids deleting
+        # words such as "user"/"model" from genuine output.
+        prompt_length = inputs["input_ids"].shape[-1]
+        decoded = self.processor.tokenizer.decode(
+            outputs[0][prompt_length:], skip_special_tokens=True
+        )
+        # Defensive cleanup in case the model echoes the instruction back.
+        return decoded.replace(instruction, "").strip()
+    def process_multi_line(self, image, model_name, progress=None):
+        """Detect text lines in an image and OCR each one, top to bottom.
+
+        The image is written to a temporary directory, a ``TextDetector``
+        bound to that directory finds the text regions, and every detected
+        bounding box is cropped out and passed to ``process_single_line``.
+
+        Args:
+            image: PIL image to process.
+            model_name: Key of ``MODELS`` forwarded to
+                ``process_single_line``.
+            progress: Optional callable invoked as
+                ``progress(fraction, desc=...)``.
+
+        Returns:
+            Tuple ``(text, images)``: the per-line results joined with
+            newlines and a one-element list holding a copy of the image
+            with the detected boxes drawn on it. If nothing is detected,
+            a short message and an empty list.
+        """
+        def report(fraction, desc):
+            # Progress reporting is best-effort: a failing UI callback must
+            # not abort OCR. Only ordinary exceptions are suppressed, so
+            # KeyboardInterrupt/SystemExit still propagate.
+            if progress is None:
+                return
+            try:
+                progress(fraction, desc=desc)
+            except Exception as exc:
+                print(f"Progress update failed: {exc}")
+
+        # Create temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save input image
+            input_path = os.path.join(temp_dir, "input.png")
+            image.save(input_path)
+            # Initialize detector with temp directory
+            detector = TextDetector(output_dir=temp_dir)
+            # Run text detection
+            report(0.1, "Detecting text regions...")
+            results = detector.process_input(input_path, save_images=True)
+            # Get text regions for the image
+            regions = detector.get_text_regions(results, "input")
+            if not regions:
+                return "No text regions detected", []
+            # Only the first page is relevant for a single image
+            text_lines = regions[0].get('bboxes', [])
+            if not text_lines:
+                return "No text lines detected", []
+            # Order lines top to bottom by the bbox's second coordinate
+            # (y1); sorted() leaves the detector's own list untouched.
+            text_lines = sorted(text_lines, key=lambda entry: entry['bbox'][1])
+            # Draw bounding boxes on a copy so the crops below stay clean
+            bbox_image = image.copy()
+            bbox_image = self.draw_bboxes(bbox_image, text_lines)
+            # Process each text line
+            all_text = []
+            total_lines = len(text_lines)
+            for i, line in enumerate(text_lines):
+                # Continue from the 0.1 reported for detection so progress
+                # never moves backwards, and reach 1.0 only after the last
+                # line has actually been processed.
+                report(0.1 + 0.9 * i / total_lines, f"Processing line {i+1}/{total_lines}")
+                # Extract text region using bbox
+                x1, y1, x2, y2 = line['bbox']
+                line_image = image.crop((x1, y1, x2, y2))
+                all_text.append(self.process_single_line(line_image, model_name))
+            report(1.0, "Done!")
+            return "\n".join(all_text), [bbox_image]  # Return as list for gallery
+    def process_pdf(self, pdf_path, model_name, progress=None):
+        """Process a PDF file"""
+        if pdf_path is None:
+            return "", []
+        # Load model if different model selected
+        if model_name != self.current_model_name:
+            try:
+                if progress is not None:
+                    progress(0, desc="Loading model...")
+            except:
+                pass
+            self.load_model(model_name)
+        # Create temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Initialize detector with temp directory
+            self.detector.output_dir = temp_dir
+            # Run text detection on PDF (process first 2 pages)
+            try:
+                if progress is not None:
+                    progress(0.1, desc="Detecting text regions in PDF...")
+            except:
+                pass
+            results = self.detector.process_input(pdf_path, save_images=True, page_range="0")
+            # Get text regions for the PDF
+            regions = self.detector.get_text_regions(results, os.path.splitext(os.path.basename(pdf_path))[0])
+            if not regions:
+                return "No text regions detected", []
+            # Process each page
+            all_text = []
+            bbox_images = []
+            # Get the base name of the PDF without extension
+            pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+            for page_num, page_regions in enumerate(regions):
+                try:
+                    if progress is not None:
+                        progress(0.2 + (page_num/len(regions))*0.3, desc=f"Processing page {page_num+1}/{len(regions)}...")
+                except:
+                    pass
+                # Try different possible paths for the page image
+                possible_paths = [
+                    os.path.join(temp_dir, pdf_name, f"{pdf_name}_{page_num}_bbox.png"),  # Detector's actual path
+                    os.path.join(temp_dir, pdf_name, f"page_{page_num}.png"),  # Original path
+                    os.path.join(temp_dir, f"page_{page_num}.png"),  # Direct in output dir
+                    os.path.join(temp_dir, f"{pdf_name}_page_{page_num}.png")  # Alternative naming
+                ]
+                page_image = None
+                for page_image_path in possible_paths:
+                    if os.path.exists(page_image_path):
+                        page_image = Image.open(page_image_path)
+                        break
+                if page_image is None:
+                    all_text.append(f"\nPage {page_num+1}: Page image not found. Tried paths:\n" +
+                                  "\n".join(f"- {path}" for path in possible_paths))
+                    continue
+                text_lines = page_regions.get('bboxes', [])
+                if not text_lines:
+                    all_text.append(f"\nPage {page_num+1}: No text lines detected")
+                    continue
+                # Sort text lines by y-coordinate (top to bottom)
+                text_lines.sort(key=lambda x: x['bbox'][1])
+                # Draw bounding boxes on the image
+                bbox_image = page_image.copy()
+                bbox_image = self.draw_bboxes(bbox_image, text_lines)
+                bbox_images.append(bbox_image)
+                # Process each text line
+                page_text = []
+                total_lines = len(text_lines)
+                for i, line in enumerate(text_lines):
+                    try:
+                        if progress is not None:
+                            progress(0.5 + (page_num/len(regions))*0.2 + (i/total_lines)*0.3,
+                                    desc=f"Processing line {i+1}/{total_lines} on page {page_num+1}/{len(regions)}...")
+                    except:
+                        pass
+                    # Extract text region using bbox
+                    x1, y1, x2, y2 = line['bbox']
+                    line_image = page_image.crop((x1, y1, x2, y2))
+                    # Process the line
+                    line_text = self.process_single_line(line_image, model_name)
+                    page_text.append(line_text)
+                # Add page text without page number
+                all_text.extend(page_text)
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+            return "\n".join(all_text), bbox_images  # Return list of bbox images
+    @staticmethod
+    def draw_bboxes(image, text_lines):
+        """Draw bounding boxes on the image"""
+        draw = ImageDraw.Draw(image)
+        for line in text_lines:
+            # Draw polygon - flatten nested coordinates
+            polygon = line['polygon']
+            flat_polygon = [coord for point in polygon for coord in point]
+            draw.polygon(flat_polygon, outline="red", width=2)
+            # Draw bbox
+            x1, y1, x2, y2 = line['bbox']
+            draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
+            # Draw confidence score
+            draw.text((x1, y1 - 10), f"{line['confidence']:.2f}", fill="red")
+        return image

paligemma2.py ADDED Viewed

	@@ -0,0 +1,315 @@

+import torch
+from PIL import Image, ImageDraw
+from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+from peft import PeftModel, PeftConfig
+import numpy as np
+from detector import TextDetector
+import tempfile
+import os
+# List of available models with their IDs and prompts
+MODELS = {
+    "Medium-14k, Single Line": {
+        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-14k",
+        "prompt": "What text is written in this image?"
+    },
+    "Medium-16k, Single Line": {
+        "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-md-16k",
+        "prompt": "What text is written in this image?"
+    },
+    "Small, Single Line": {
+        "id": "alakxender/paligemma2-qlora-vrd-dhivehi-ocr-224-sm",
+        "prompt": "What text is written in this image?"
+    }
+}
+class PaliGemma2Handler:
+    def __init__(self):
+        self.model = None
+        self.processor = None
+        self.current_model_name = None
+        self.detector = TextDetector()
+    def load_model(self, model_name):
+        """Load the model and processor"""
+        model_id = MODELS[model_name]['id']
+        # Load the PEFT configuration to get the base model path
+        peft_config = PeftConfig.from_pretrained(model_id)
+        # Load the base model
+        base_model = PaliGemmaForConditionalGeneration.from_pretrained(
+            peft_config.base_model_name_or_path,
+            device_map="auto",
+            torch_dtype=torch.bfloat16
+        )
+        # Load the adapter on top of the base model
+        self.model = PeftModel.from_pretrained(base_model, model_id)
+        self.processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path)
+        self.current_model_name = model_name
+    def process_image(self, model_name, image, progress=None):
+        """Process a single image"""
+        if image is None:
+            return "", []
+        # Load model if different model selected
+        if model_name != self.current_model_name:
+            try:
+                if progress is not None:
+                    progress(0, desc="Loading model...")
+            except:
+                pass
+            self.load_model(model_name)
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        width, height = image.size
+        print(f"Image dimensions: {width}x{height}")
+        # Check if image proportions are similar to a single line
+        # Typical single line has width significantly larger than height
+        # and aspect ratio (width/height) greater than 3
+        aspect_ratio = width / height
+        if height <= 50 or aspect_ratio > 3:
+            try:
+                if progress is not None:
+                    progress(0.5, desc="Processing single line...")
+            except:
+                pass
+            result = self.process_single_line(image, model_name)
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+            return result, [image]
+        else:
+            return self.process_multi_line(image, model_name, progress)
+    def process_single_line(self, image, model_name):
+        """Process a single line of text"""
+        prompt = MODELS[model_name]["prompt"]
+        # Add image token to prompt
+        prompt = f"<image>{prompt}"
+        # First prepare inputs without moving to CUDA
+        model_inputs = self.processor(text=prompt, images=image, return_tensors="pt")
+        # Then move to CUDA and convert only image tensors to bfloat16
+        for k, v in model_inputs.items():
+            if k == "pixel_values":
+                model_inputs[k] = v.to(torch.bfloat16).to("cuda")
+            else:
+                model_inputs[k] = v.to("cuda")
+        outputs = self.model.generate(
+            **model_inputs,
+            max_new_tokens=500,
+            do_sample=False
+        )
+        generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        # Remove the prompt and any leading/trailing whitespace
+        cleaned_text = generated_text.replace(prompt, "").strip()
+        # Remove any remaining question marks or other artifacts
+        cleaned_text = cleaned_text.lstrip("?").strip()
+        # Remove the prompt text if it somehow appears in the output
+        cleaned_text = cleaned_text.replace("What text is written in this image?", "").strip()
+        return cleaned_text
+    def process_multi_line(self, image, model_name, progress=None):
+        """Process a multi-line image by detecting text regions and OCRing each region"""
+        # Create temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save input image
+            input_path = os.path.join(temp_dir, "input.png")
+            image.save(input_path)
+            # Initialize detector with temp directory
+            detector = TextDetector(output_dir=temp_dir)
+            # Run text detection
+            try:
+                if progress is not None:
+                    progress(0.1, desc="Detecting text regions...")
+            except:
+                pass
+            results = detector.process_input(input_path, save_images=True)
+            # Get text regions for the image
+            regions = detector.get_text_regions(results, "input")
+            if not regions:
+                return "No text regions detected", []
+            # Process each text region
+            page_regions = regions[0]  # First page
+            text_lines = page_regions.get('bboxes', [])
+            if not text_lines:
+                return "No text lines detected", []
+            # Sort text lines by y-coordinate (top to bottom)
+            text_lines.sort(key=lambda x: x['bbox'][1])
+            # Draw bounding boxes on the image
+            bbox_image = image.copy()
+            bbox_image = self.draw_bboxes(bbox_image, text_lines)
+            # Process each text line
+            all_text = []
+            total_lines = len(text_lines)
+            for i, line in enumerate(text_lines):
+                try:
+                    if progress is not None:
+                        progress((i + 1) / total_lines, desc=f"Processing line {i+1}/{total_lines}")
+                except:
+                    pass
+                # Extract text region using bbox
+                x1, y1, x2, y2 = line['bbox']
+                line_image = image.crop((x1, y1, x2, y2))
+                # Process the line
+                line_text = self.process_single_line(line_image, model_name)
+                all_text.append(line_text)
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+            return "\n".join(all_text), [bbox_image]  # Return as list for gallery
+    def process_pdf(self, pdf_path, model_name, progress=None):
+        """Process a PDF file"""
+        if pdf_path is None:
+            return "", []
+        # Load model if different model selected
+        if model_name != self.current_model_name:
+            try:
+                if progress is not None:
+                    progress(0, desc="Loading model...")
+            except:
+                pass
+            self.load_model(model_name)
+        # Create temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Initialize detector with temp directory
+            self.detector.output_dir = temp_dir
+            # Run text detection on PDF (process first 2 pages)
+            try:
+                if progress is not None:
+                    progress(0.1, desc="Detecting text regions in PDF...")
+            except:
+                pass
+            results = self.detector.process_input(pdf_path, save_images=True, page_range="0")
+            # Get text regions for the PDF
+            regions = self.detector.get_text_regions(results, os.path.splitext(os.path.basename(pdf_path))[0])
+            if not regions:
+                return "No text regions detected", []
+            # Process each page
+            all_text = []
+            bbox_images = []
+            # Get the base name of the PDF without extension
+            pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+            for page_num, page_regions in enumerate(regions):
+                try:
+                    if progress is not None:
+                        progress(0.2 + (page_num/len(regions))*0.3, desc=f"Processing page {page_num+1}/{len(regions)}...")
+                except:
+                    pass
+                # Try different possible paths for the page image
+                possible_paths = [
+                    os.path.join(temp_dir, pdf_name, f"{pdf_name}_{page_num}_bbox.png"),  # Detector's actual path
+                    os.path.join(temp_dir, pdf_name, f"page_{page_num}.png"),  # Original path
+                    os.path.join(temp_dir, f"page_{page_num}.png"),  # Direct in output dir
+                    os.path.join(temp_dir, f"{pdf_name}_page_{page_num}.png")  # Alternative naming
+                ]
+                page_image = None
+                for page_image_path in possible_paths:
+                    if os.path.exists(page_image_path):
+                        page_image = Image.open(page_image_path)
+                        break
+                if page_image is None:
+                    all_text.append(f"\nPage {page_num+1}: Page image not found. Tried paths:\n" +
+                                  "\n".join(f"- {path}" for path in possible_paths))
+                    continue
+                text_lines = page_regions.get('bboxes', [])
+                if not text_lines:
+                    all_text.append(f"\nPage {page_num+1}: No text lines detected")
+                    continue
+                # Sort text lines by y-coordinate (top to bottom)
+                text_lines.sort(key=lambda x: x['bbox'][1])
+                # Draw bounding boxes on the image
+                bbox_image = page_image.copy()
+                bbox_image = self.draw_bboxes(bbox_image, text_lines)
+                bbox_images.append(bbox_image)
+                # Process each text line
+                page_text = []
+                total_lines = len(text_lines)
+                for i, line in enumerate(text_lines):
+                    try:
+                        if progress is not None:
+                            progress(0.5 + (page_num/len(regions))*0.2 + (i/total_lines)*0.3,
+                                    desc=f"Processing line {i+1}/{total_lines} on page {page_num+1}/{len(regions)}...")
+                    except:
+                        pass
+                    # Extract text region using bbox
+                    x1, y1, x2, y2 = line['bbox']
+                    line_image = page_image.crop((x1, y1, x2, y2))
+                    # Process the line
+                    line_text = self.process_single_line(line_image, model_name)
+                    page_text.append(line_text)
+                # Add page text without page number
+                all_text.extend(page_text)
+            try:
+                if progress is not None:
+                    progress(1.0, desc="Done!")
+            except:
+                pass
+            return "\n".join(all_text), bbox_images  # Return list of bbox images
+    @staticmethod
+    def draw_bboxes(image, text_lines):
+        """Draw bounding boxes on the image"""
+        draw = ImageDraw.Draw(image)
+        for line in text_lines:
+            # Draw polygon - flatten nested coordinates
+            polygon = line['polygon']
+            flat_polygon = [coord for point in polygon for coord in point]
+            draw.polygon(flat_polygon, outline="red", width=2)
+            # Draw bbox
+            x1, y1, x2, y2 = line['bbox']
+            draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
+            # Draw confidence score
+            draw.text((x1, y1 - 10), f"{line['confidence']:.2f}", fill="red")
+        return image