import os
import json
import subprocess
from typing import Union, List, Dict, Optional
from pathlib import Path


class TextDetector:
    def __init__(self, output_dir: Optional[str] = None):
        """
        Initialize the text detector.

        Args:
            output_dir: Optional directory to save results. If None, uses the
                default surya_detect output directory.
        """
        self.output_dir = output_dir

    def process_input(self,
                      data_path: Union[str, Path],
                      save_images: bool = False,
                      page_range: Optional[str] = None) -> Dict:
        """
        Process an input file or directory using surya_detect.

        Args:
            data_path: Path to an image, PDF, or directory of images/PDFs
            save_images: Whether to save images of pages and detected text lines
            page_range: Optional page range to process in PDFs (e.g., "0,5-10,20")

        Returns:
            Dictionary containing detection results
        """
        # Convert to a Path object if a string was passed
        data_path = Path(data_path)

        # Build the surya_detect command
        cmd = ["surya_detect", str(data_path)]
        if save_images:
            cmd.append("--images")
        if self.output_dir:
            cmd.extend(["--output_dir", self.output_dir])
        if page_range:
            cmd.extend(["--page_range", page_range])

        # Run surya_detect
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Error running surya_detect: {e}") from e

        # Read and return the results
        return self._read_results(data_path)

    def _read_results(self, data_path: Path) -> Dict:
        """
        Read and parse the results.json file generated by surya_detect.

        Args:
            data_path: Path to the input file/directory

        Returns:
            Dictionary containing detection results
        """
        # Determine the results file path
        if self.output_dir:
            # surya_detect creates a subdirectory named after the input file
            input_name = data_path.stem
            results_path = Path(self.output_dir) / input_name / "results.json"
        else:
            # Default surya_detect output location
            results_path = data_path.parent / "results.json"

        if not results_path.exists():
            raise FileNotFoundError(f"Results file not found at {results_path}")

        # Read and parse the results
        with open(results_path, "r") as f:
            results = json.load(f)

        return results

    def get_text_regions(self, results: Dict, filename: str) -> List[Dict]:
        """
        Extract text regions from detection results for a specific file.

        Args:
            results: Detection results dictionary
            filename: Name of the file to get regions for (without extension)

        Returns:
            List of dictionaries containing text regions for each page
        """
        if filename not in results:
            raise KeyError(f"No results found for file {filename}")
        return results[filename]

    def get_page_regions(self, results: Dict, filename: str, page_num: int) -> Dict:
        """
        Get text regions for a specific page of a file.

        Args:
            results: Detection results dictionary
            filename: Name of the file (without extension)
            page_num: Page number (0-based)

        Returns:
            Dictionary containing text regions for the specified page
        """
        regions = self.get_text_regions(results, filename)
        if page_num >= len(regions):
            raise IndexError(f"Page {page_num} not found in results")
        return regions[page_num]

    def get_text_lines(self, page_regions: Dict) -> List[Dict]:
        """
        Extract text lines from page regions.

        Args:
            page_regions: Dictionary containing page detection results

        Returns:
            List of dictionaries containing text line information
        """
        return page_regions.get("bboxes", [])

    def get_vertical_lines(self, page_regions: Dict) -> List[Dict]:
        """
        Extract vertical lines from page regions.

        Args:
            page_regions: Dictionary containing page detection results

        Returns:
            List of dictionaries containing vertical line information
        """
        return page_regions.get("vertical_lines", [])
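

# Example usage: a minimal sketch, not part of the original class. The input
# file "example.pdf", the output directory "detection_output", and the
# per-line keys ("bbox", "confidence") are assumptions about the surya_detect
# results layout that the methods above rely on; adjust them to your setup.
if __name__ == "__main__":
    detector = TextDetector(output_dir="detection_output")

    # Run detection on the first three pages of a hypothetical PDF
    results = detector.process_input("example.pdf", save_images=True, page_range="0-2")

    # Results are keyed by the input filename without its extension
    first_page = detector.get_page_regions(results, "example", page_num=0)

    # Each entry in "bboxes" describes one detected text line on the page
    for line in detector.get_text_lines(first_page):
        print(line.get("bbox"), line.get("confidence"))

    # Vertical lines (e.g., column separators) are reported separately
    print(len(detector.get_vertical_lines(first_page)), "vertical lines detected")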