alakxender committed
Commit 228e8c1 · 0 Parent(s)
Files changed (14):
  1. .gitattributes +36 -0
  2. .gitignore +2 -0
  3. README.md +13 -0
  4. app.py +423 -0
  5. detector.py +141 -0
  6. example.pdf +3 -0
  7. hw_1_sl.png +0 -0
  8. hw_2_sl.jpg +0 -0
  9. hw_3_sl.png +0 -0
  10. hw_4_sl.png +0 -0
  11. ml.png +0 -0
  12. requirements.txt +3 -0
  13. type_1_sl.png +0 -0
  14. type_2_sl.png +0 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ output
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Dhivehi Ocr
+ emoji: 📝
+ colorFrom: gray
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 5.25.2
+ app_file: app.py
+ pinned: false
+ short_description: Thaana text-to-image, ocr
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,423 @@
+ import spaces
+ import gradio as gr
+ import os
+ import sys
+ import subprocess
+ from PIL import Image, ImageDraw
+ from detector import TextDetector
+ import tempfile
+ import shutil
+ import json
+ from datetime import datetime
+
+ # List of available models with their IDs and prompts
+ MODELS = {
+     "Medium-14k, Single Line": {  # /lab/mx01/md/sl-14/ft/
+         "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-14k",
+         "prompt": "What text is written in this image?"
+     },
+     "Medium-16k, Single Line": {  # /lab/mx01/md/sl-16/ft/
+         "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-sl-md-16k",
+         "prompt": "What text is written in this image?"
+     },
+     "Small, Single Line": {  # /lab/mx01/sm/sl/ft/
+         "id": "alakxender/paligemma2-qlora-vrd-dhivehi-ocr-224-sm",
+         "prompt": "What text is written in this image?"
+     }
+ }
+ # Earlier experimental entries, kept here for reference:
+ """
+ "Full Text": {  # /lab/mx01/pr/sl/ft/
+     "id": "alakxender/paligemma2-qlora-dhivehi-ocr-224-mx01",
+     "prompt": "What text is written in this image?",
+ },
+ "Full Text": {  # /lab/mx01/pr/sl/ft/
+     "id": "alakxender/paligemma2-qlora-dhivehi-ocr-448-mx01",
+     "prompt": "OCR",
+ },
+ "Final": {  # /lab/mx01/pr/sl/ft-final/
+     "id": "alakxender/paligemma2-dhivehi-ocr-448-mx01-final",
+     "prompt": "OCR",  # smaller the better: 3k vrd, 3k printed, 3k handwritten, 1k single line
+ }
+ """
+
+ # Global model state
+ model = None
+ processor = None
+ current_model_name = None
+ detector = TextDetector()
+
+ def load_model(model_name):
+     """Load the model and processor"""
+     global model, processor, current_model_name
+
+     model_id = MODELS[model_name]['id']
+
+     # Load the PEFT configuration to get the base model path
+     peft_config = PeftConfig.from_pretrained(model_id)
+
+     # Load the base model
+     base_model = PaliGemmaForConditionalGeneration.from_pretrained(
+         peft_config.base_model_name_or_path,
+         device_map="auto",
+         torch_dtype=torch.bfloat16
+     )
+
+     # Load the adapter on top of the base model
+     model = PeftModel.from_pretrained(base_model, model_id)
+     processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path)
+     current_model_name = model_name
+
+ def process_single_line(image, model_name):
+     """Process a single line of text"""
+     prompt = MODELS[model_name]["prompt"]
+     # Add the image token to the prompt
+     prompt = f"<image>{prompt}"
+     model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch.bfloat16).to("cuda")
+
+     outputs = model.generate(
+         **model_inputs,
+         max_new_tokens=500,
+         do_sample=False
+     )
+
+     generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+     # Remove the prompt and any leading/trailing whitespace
+     cleaned_text = generated_text.replace(prompt, "").strip()
+     # Remove any remaining question marks or other artifacts
+     cleaned_text = cleaned_text.lstrip("?").strip()
+     # Remove the prompt text if it somehow appears in the output
+     cleaned_text = cleaned_text.replace("What text is written in this image?", "").strip()
+     return cleaned_text
+
+ def draw_bboxes(image, text_lines):
+     """Draw bounding boxes on the image"""
+     draw = ImageDraw.Draw(image)
+     for line in text_lines:
+         # Draw polygon - flatten nested coordinates
+         polygon = line['polygon']
+         flat_polygon = [coord for point in polygon for coord in point]
+         draw.polygon(flat_polygon, outline="red", width=2)
+
+         # Draw bbox
+         x1, y1, x2, y2 = line['bbox']
+         draw.rectangle([x1, y1, x2, y2], outline="blue", width=1)
+
+         # Draw confidence score
+         draw.text((x1, y1 - 10), f"{line['confidence']:.2f}", fill="red")
+     return image
+
+ def process_multi_line(image, model_name, progress=gr.Progress()):
+     """Process a multi-line image by detecting text regions and OCRing each region"""
+     # Create temporary directory
+     with tempfile.TemporaryDirectory() as temp_dir:
+         # Save input image
+         input_path = os.path.join(temp_dir, "input.png")
+         image.save(input_path)
+
+         # Initialize detector with temp directory
+         detector = TextDetector(output_dir=temp_dir)
+
+         # Run text detection
+         progress(0.1, desc="Detecting text regions...")
+         results = detector.process_input(input_path, save_images=True)
+
+         # Get text regions for the image
+         regions = detector.get_text_regions(results, "input")
+         if not regions:
+             return "No text regions detected", []
+
+         # Process each text region
+         page_regions = regions[0]  # First page
+         text_lines = page_regions.get('bboxes', [])
+
+         if not text_lines:
+             return "No text lines detected", []
+
+         # Sort text lines by y-coordinate (top to bottom)
+         text_lines.sort(key=lambda x: x['bbox'][1])
+
+         # Draw bounding boxes on the image
+         bbox_image = image.copy()
+         bbox_image = draw_bboxes(bbox_image, text_lines)
+
+         # Process each text line
+         all_text = []
+         total_lines = len(text_lines)
+
+         for i, line in enumerate(text_lines):
+             progress(0.2 + (i / total_lines) * 0.8, desc=f"Processing line {i+1}/{total_lines}...")
+
+             # Extract text region using bbox
+             x1, y1, x2, y2 = line['bbox']
+             line_image = image.crop((x1, y1, x2, y2))
+
+             # Process the line
+             line_text = process_single_line(line_image, model_name)
+             all_text.append(line_text)
+
+         progress(1.0, desc="Done!")
+         return "\n".join(all_text), [bbox_image]  # Return as list for gallery
+
+ @spaces.GPU  # needed for the per-line CUDA calls, same as process_image
+ def process_pdf(pdf_path, model_name, progress=gr.Progress()):
+     """Process a PDF file"""
+     # Load model if a different model was selected (mirrors process_image)
+     if model_name != current_model_name:
+         progress(0, desc="Loading model...")
+         load_model(model_name)
+
+     # Create temporary directory
+     with tempfile.TemporaryDirectory() as temp_dir:
+         # Initialize detector with temp directory
+         detector = TextDetector(output_dir=temp_dir)
+
+         # Run text detection on PDF (process first 2 pages)
+         progress(0.1, desc="Detecting text regions in PDF...")
+         results = detector.process_input(pdf_path, save_images=True, page_range="0,1")
+
+         # Get text regions for the PDF
+         regions = detector.get_text_regions(results, os.path.splitext(os.path.basename(pdf_path))[0])
+         if not regions:
+             return "No text regions detected", []
+
+         # Process each page
+         all_text = []
+         bbox_images = []
+
+         # Get the base name of the PDF without extension
+         pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
+         for page_num, page_regions in enumerate(regions):
+             progress(0.2 + (page_num / 2) * 0.3, desc=f"Processing page {page_num+1}...")
+
+             # Try different possible paths for the page image
+             possible_paths = [
+                 os.path.join(temp_dir, pdf_name, f"{pdf_name}_{page_num}_bbox.png"),  # Detector's actual path
+                 os.path.join(temp_dir, pdf_name, f"page_{page_num}.png"),  # Original path
+                 os.path.join(temp_dir, f"page_{page_num}.png"),  # Direct in output dir
+                 os.path.join(temp_dir, f"{pdf_name}_page_{page_num}.png")  # Alternative naming
+             ]
+
+             page_image = None
+             for page_image_path in possible_paths:
+                 if os.path.exists(page_image_path):
+                     page_image = Image.open(page_image_path)
+                     break
+
+             if page_image is None:
+                 all_text.append(f"\nPage {page_num+1}: Page image not found. Tried paths:\n" +
+                                 "\n".join(f"- {path}" for path in possible_paths))
+                 continue
+
+             text_lines = page_regions.get('bboxes', [])
+             if not text_lines:
+                 all_text.append(f"\nPage {page_num+1}: No text lines detected")
+                 continue
+
+             # Sort text lines by y-coordinate (top to bottom)
+             text_lines.sort(key=lambda x: x['bbox'][1])
+
+             # Draw bounding boxes on the image
+             bbox_image = page_image.copy()
+             bbox_image = draw_bboxes(bbox_image, text_lines)
+             bbox_images.append(bbox_image)
+
+             # Process each text line
+             page_text = []
+             total_lines = len(text_lines)
+
+             for i, line in enumerate(text_lines):
+                 progress(0.5 + (page_num / 2) * 0.2 + (i / total_lines) * 0.3,
+                          desc=f"Processing line {i+1}/{total_lines} on page {page_num+1}...")
+
+                 # Extract text region using bbox
+                 x1, y1, x2, y2 = line['bbox']
+                 line_image = page_image.crop((x1, y1, x2, y2))
+
+                 # Process the line
+                 line_text = process_single_line(line_image, model_name)
+                 page_text.append(line_text)
+
+             # Add page text without page numbers
+             all_text.extend(page_text)
+
+         progress(1.0, desc="Done!")
+         return "\n".join(all_text), bbox_images  # Return list of bbox images
+
+ @spaces.GPU
+ def process_image(model_name, image, progress=gr.Progress()):
+     """Process a single image"""
+     if image is None:
+         return "", None
+
+     # Load model if a different model was selected
+     if model_name != current_model_name:
+         progress(0, desc="Loading model...")
+         load_model(model_name)
+
+     return process_multi_line(image, model_name, progress)
+
+ # Example images with descriptions
+ examples = [
+     ["type_1_sl.png", "Typed Dhivehi text sample 1"],
+     ["type_2_sl.png", "Typed Dhivehi text sample 2"],
+     ["hw_1_sl.png", "Handwritten Dhivehi text sample 1"],  # exp this
+     ["hw_2_sl.jpg", "Handwritten Dhivehi text sample 2"],  # exp val3
+     ["hw_3_sl.png", "Handwritten Dhivehi text sample 3"],  # exp val2
+     ["hw_4_sl.png", "Handwritten Dhivehi text sample 4"],  # exp val1
+     ["ml.png", "Multi-line Dhivehi text sample"]
+ ]
+
+ css = """
+ .textbox1 textarea {
+     font-size: 18px !important;
+     font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
+     line-height: 1.8 !important;
+ }
+ .textbox2 textarea {
+     display: none;
+ }
+ """
+
+ with gr.Blocks(title="Dhivehi OCR", css=css) as demo:
+     gr.Markdown("# Dhivehi OCR")
+     gr.Markdown("Thaana OCR experimental finetunes")
+
+     with gr.Row():
+         model_dropdown = gr.Dropdown(
+             choices=list(MODELS.keys()),
+             value=list(MODELS.keys())[0],  # Default to first model
+             label="Select Model"
+         )
+
+     with gr.Tabs():
+         with gr.Tab("Image Input"):
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     image_input = gr.Image(type="pil", label="Input Image")
+                     image_submit_btn = gr.Button("Extract Text")
+
+                     # Image examples
+                     gr.Examples(
+                         examples=[[img] for img, _ in examples],
+                         inputs=[image_input],
+                         label="Example Images",
+                         examples_per_page=8
+                     )
+
+                 with gr.Column(scale=3):
+                     with gr.Tabs():
+                         with gr.Tab("Extracted Text"):
+                             image_text_output = gr.Textbox(
+                                 lines=5,
+                                 label="Extracted Text",
+                                 show_copy_button=True,
+                                 rtl=True,
+                                 elem_classes="textbox1"
+                             )
+
+                         with gr.Tab("Detected Text Regions"):
+                             image_bbox_output = gr.Gallery(
+                                 label="Detected Text Regions",
+                                 show_label=True,
+                                 columns=2
+                             )
+
+         with gr.Tab("PDF Input"):
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     pdf_input = gr.File(
+                         label="Input PDF",
+                         file_types=[".pdf"]
+                     )
+                     pdf_submit_btn = gr.Button("Extract Text")
+
+                     # PDF examples (one value per input; add more here if needed)
+                     gr.Examples(
+                         examples=[
+                             ["example.pdf"],
+                         ],
+                         inputs=[pdf_input],
+                         label="Example PDFs",
+                         examples_per_page=8
+                     )
+
+                 with gr.Column(scale=3):
+                     with gr.Tabs():
+                         with gr.Tab("Extracted Text"):
+                             pdf_text_output = gr.Textbox(
+                                 lines=5,
+                                 label="Extracted Text",
+                                 show_copy_button=True,
+                                 rtl=True,
+                                 elem_classes="textbox1"
+                             )
+
+                         with gr.Tab("Detected Text Regions"):
+                             pdf_bbox_output = gr.Gallery(
+                                 label="Detected Text Regions",
+                                 show_label=True,
+                                 columns=2
+                             )
+
+     # Process image when button is clicked
+     image_submit_btn.click(
+         fn=process_image,
+         inputs=[model_dropdown, image_input],
+         outputs=[image_text_output, image_bbox_output]
+     )
+
+     # Process PDF when button is clicked
+     pdf_submit_btn.click(
+         fn=process_pdf,
+         inputs=[pdf_input, model_dropdown],
+         outputs=[pdf_text_output, pdf_bbox_output]
+     )
+
+     # Add experimental note at the bottom
+     gr.Markdown("""
+     ---
+     **Note:** This is an experimental proof of concept (POC) for Dhivehi OCR.
+     """)
+
+ # Function to install requirements
+ def install_requirements():
+     requirements_path = 'requirements.txt'
+
+     # Check if requirements.txt exists
+     if not os.path.exists(requirements_path):
+         print("Error: requirements.txt not found")
+         return False
+
+     try:
+         print("Installing requirements...")
+         # Using --no-cache-dir to avoid memory issues
+         subprocess.check_call([
+             sys.executable,
+             "-m",
+             "pip",
+             "install",
+             "-r",
+             requirements_path,
+             "--no-cache-dir"
+         ])
+         print("Successfully installed all requirements")
+         return True
+     except subprocess.CalledProcessError as e:
+         print(f"Error installing requirements: {e}")
+         return False
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+         return False
+
+ # Launch the app
+ if __name__ == "__main__":
+     # First install requirements
+     success = install_requirements()
+     if success:
+         print("All requirements installed successfully")
+
+         # Heavy imports are deferred until after the requirements install;
+         # they land in module globals, so load_model and the handlers can use them.
+         from transformers.image_utils import load_image
+         import torch
+         from transformers import PaliGemmaForConditionalGeneration, AutoProcessor
+         from peft import PeftModel, PeftConfig
+
+         # Load the first model by default
+         load_model(list(MODELS.keys())[0])
+
+         demo.launch(server_name="0.0.0.0", server_port=7812)
+         # demo.launch()
+     else:
+         print("Failed to install some requirements")
detector.py ADDED
@@ -0,0 +1,141 @@
+ import os
+ import json
+ import subprocess
+ from typing import Union, List, Dict, Optional
+ from pathlib import Path
+
+ class TextDetector:
+     def __init__(self, output_dir: Optional[str] = None):
+         """
+         Initialize the text detector.
+
+         Args:
+             output_dir: Optional directory to save results. If None, uses default surya_detect output directory.
+         """
+         self.output_dir = output_dir
+
+     def process_input(self,
+                       data_path: Union[str, Path],
+                       save_images: bool = False,
+                       page_range: Optional[str] = None) -> Dict:
+         """
+         Process input file or directory using surya_detect.
+
+         Args:
+             data_path: Path to image, PDF, or directory of images/PDFs
+             save_images: Whether to save images of pages and detected text lines
+             page_range: Optional page range to process in PDFs (e.g., "0,5-10,20")
+
+         Returns:
+             Dictionary containing detection results
+         """
+         # Convert to Path object if string
+         data_path = Path(data_path)
+
+         # Build surya_detect command
+         cmd = ["surya_detect", str(data_path)]
+
+         if save_images:
+             cmd.append("--images")
+
+         if self.output_dir:
+             cmd.extend(["--output_dir", self.output_dir])
+
+         if page_range:
+             cmd.extend(["--page_range", page_range])
+
+         # Run surya_detect
+         try:
+             subprocess.run(cmd, check=True)
+         except subprocess.CalledProcessError as e:
+             raise RuntimeError(f"Error running surya_detect: {e}")
+
+         # Read and return results
+         return self._read_results(data_path)
+
+     def _read_results(self, data_path: Path) -> Dict:
+         """
+         Read and parse the results.json file generated by surya_detect.
+
+         Args:
+             data_path: Path to the input file/directory
+
+         Returns:
+             Dictionary containing detection results
+         """
+         # Determine results file path
+         if self.output_dir:
+             # surya_detect creates a subdirectory with the input filename
+             input_name = data_path.stem
+             results_path = Path(self.output_dir) / input_name / "results.json"
+         else:
+             # Default surya_detect output location
+             results_path = data_path.parent / "results.json"
+
+         if not results_path.exists():
+             raise FileNotFoundError(f"Results file not found at {results_path}")
+
+         # Read and parse results
+         with open(results_path, 'r') as f:
+             results = json.load(f)
+
+         return results
+
+     def get_text_regions(self, results: Dict, filename: str) -> List[Dict]:
+         """
+         Extract text regions from detection results for a specific file.
+
+         Args:
+             results: Detection results dictionary
+             filename: Name of the file to get regions for (without extension)
+
+         Returns:
+             List of dictionaries containing text regions for each page
+         """
+         if filename not in results:
+             raise KeyError(f"No results found for file {filename}")
+
+         return results[filename]
+
+     def get_page_regions(self, results: Dict, filename: str, page_num: int) -> Dict:
+         """
+         Get text regions for a specific page of a file.
+
+         Args:
+             results: Detection results dictionary
+             filename: Name of the file (without extension)
+             page_num: Page number (0-based)
+
+         Returns:
+             Dictionary containing text regions for the specified page
+         """
+         regions = self.get_text_regions(results, filename)
+
+         if page_num >= len(regions):
+             raise IndexError(f"Page {page_num} not found in results")
+
+         return regions[page_num]
+
+     def get_text_lines(self, page_regions: Dict) -> List[Dict]:
+         """
+         Extract text lines from page regions.
+
+         Args:
+             page_regions: Dictionary containing page detection results
+
+         Returns:
+             List of dictionaries containing text line information
+         """
+         return page_regions.get('bboxes', [])
+
+     def get_vertical_lines(self, page_regions: Dict) -> List[Dict]:
+         """
+         Extract vertical lines from page regions.
+
+         Args:
+             page_regions: Dictionary containing page detection results
+
+         Returns:
+             List of dictionaries containing vertical line information
+         """
+         return page_regions.get('vertical_lines', [])
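
A minimal usage sketch for the TextDetector wrapper above, assuming the `surya_detect` CLI (installed via the pinned `surya-ocr` package) is on PATH; `sample.png` and the `output` directory are illustrative names, not part of the commit:

```python
# Usage sketch for TextDetector; "sample.png" and "output" are illustrative.
from detector import TextDetector

detector = TextDetector(output_dir="output")
results = detector.process_input("sample.png", save_images=True)

# Results are keyed by the input file's stem (name without extension).
first_page = detector.get_page_regions(results, "sample", 0)

# Each detected line carries 'bbox', 'polygon', and 'confidence' fields,
# which is how app.py crops and annotates the regions.
for line in detector.get_text_lines(first_page):
    x1, y1, x2, y2 = line["bbox"]
    print(f"({x1}, {y1})-({x2}, {y2}) confidence={line['confidence']:.2f}")
```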
example.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:525262d9de0efaf83a0c8559d58d6f11f13dec1c319ff10a70a463047fa5ff80
+ size 100352
hw_1_sl.png ADDED
hw_2_sl.jpg ADDED
hw_3_sl.png ADDED
hw_4_sl.png ADDED
ml.png ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers
+ peft
+ surya-ocr==0.13.1
type_1_sl.png ADDED
type_2_sl.png ADDED