|
import logging |
|
import json |
|
import yaml |
|
import gradio as gr |
|
import gradio.themes as themes |
|
from pathlib import Path |
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend |
|
from docling.datamodel.base_models import InputFormat |
|
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption |
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions |
|
from docling.utils.export import generate_multimodal_pages |
|
from docling.utils.utils import create_hash |
|
import pandas as pd |
|
import time |
|
import datetime |
|
|
|
|
|
# Configure the root logger once at import time so INFO-level progress
# messages from this module are emitted.
logging.basicConfig(level=logging.INFO)

# Module-scoped logger shared by the conversion and export helpers.
_log = logging.getLogger(__name__)
|
|
|
|
|
# Shared PDF pipeline configuration, built once at import time and reused by
# every conversion.
pipeline_options = PdfPipelineOptions(do_table_structure=True)

# Table structure recognition: ACCURATE TableFormer mode with cell matching on.
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
pipeline_options.table_structure_options.do_cell_matching = True

# OCR: EasyOCR with full-page OCR forced.
ocr_options = EasyOcrOptions(force_full_page_ocr=True)
pipeline_options.do_ocr = True
pipeline_options.ocr_options = ocr_options
# OCR language codes; presumably Indonesian and English - confirm against the
# language list of the installed OCR engine.
pipeline_options.ocr_options.lang = ["id", "en"]
|
|
|
|
|
def _export_tables(conv_res, output_dir, stem):
    """Write every table of the document as CSV and HTML; return the paths written."""
    written = []
    for table_ix, table in enumerate(conv_res.document.tables, start=1):
        table_df = table.export_to_dataframe()
        csv_path = output_dir / f"{stem}-table-{table_ix}.csv"
        html_path = output_dir / f"{stem}-table-{table_ix}.html"

        _log.info("Saving CSV table to %s", csv_path)
        table_df.to_csv(csv_path)

        _log.info("Saving HTML table to %s", html_path)
        # Explicit UTF-8: the platform default encoding may not be able to
        # represent non-ASCII text, which would make the write fail.
        html_path.write_text(table.export_to_html(), encoding="utf-8")

        written.extend((csv_path, html_path))
    return written


def _export_pictures(conv_res, output_dir, stem):
    """Save every picture that carries image data as PNG; return the paths written."""
    written = []
    for picture_ix, picture in enumerate(conv_res.document.pictures, start=1):
        if not picture.image:
            _log.warning("Skipping picture %d due to missing image.", picture_ix)
            continue

        png_path = output_dir / f"{stem}-picture-{picture_ix}.png"
        _log.info("Saving Picture to %s", png_path)
        # NOTE(review): assumes picture.image exposes a PIL-style save();
        # confirm against the installed docling version.
        picture.image.save(png_path)
        written.append(png_path)
    return written


def _export_multimodal_pages(conv_res, output_dir):
    """Collect one record per page and write them to a timestamped Parquet file."""
    rows = []
    for (
        content_text,
        content_md,
        content_dt,
        page_cells,
        page_segments,
        page,
    ) in generate_multimodal_pages(conv_res):
        # Best effort: a page that cannot be turned into a record is logged
        # and skipped so the remaining pages are still exported.
        try:
            # Presumably the image scale is relative to 72 dpi - TODO confirm.
            dpi = page._default_image_scale * 72

            # Pages without a rendered image get zero dimensions and no bytes.
            image_width = image_height = 0
            image_bytes = None
            if page.image:
                image_width = page.image.width
                image_height = page.image.height
                image_bytes = page.image.tobytes()

            rows.append({
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                # NOTE(review): page_hash uses page_no - 1 while page_num
                # below uses page_no + 1; kept unchanged - confirm the base
                # of page_no before touching either offset.
                "page_hash": create_hash(
                    conv_res.input.document_hash + ":" + str(page.page_no - 1)
                ),
                "image": {
                    "width": image_width,
                    "height": image_height,
                    "bytes": image_bytes,
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            })
        except Exception as e:
            _log.warning("Failed to process page %s: %s", page.page_no + 1, e)

    # Nested dicts ("image", "extra") are flattened into dotted column names.
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    parquet_path = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(parquet_path)
    return parquet_path


def export_tables_and_figures(conv_res, output_dir):
    """Export tables, pictures, and multimodal pages from a converted document.

    Files are written directly into ``output_dir``; the directory is not
    created here. Table and picture file names are derived from the stem of
    the input file, and the Parquet file name carries the current timestamp.

    Args:
        conv_res: Conversion result exposing ``input.file``,
            ``input.document_hash``, ``document.tables`` and
            ``document.pictures``; it is also passed to
            ``generate_multimodal_pages``.
        output_dir: ``pathlib.Path``-like directory (must support ``/``)
            that receives the exported files.

    Returns:
        list[str]: Resolved paths of the written files, in this order: the
        CSV and HTML file of each table, the PNG of each picture that has
        image data, and finally the Parquet file.
    """
    # perf_counter is monotonic, so the elapsed time cannot go negative if
    # the wall clock is adjusted while exporting.
    started = time.perf_counter()
    stem = conv_res.input.file.stem

    output_files = [
        *_export_tables(conv_res, output_dir, stem),
        *_export_pictures(conv_res, output_dir, stem),
        _export_multimodal_pages(conv_res, output_dir),
    ]

    elapsed = time.perf_counter() - started
    _log.info(
        "Tables, figures, and multimodal pages exported in %.2f seconds.",
        elapsed,
    )

    return [str(path.resolve()) for path in output_files]
|
|
|
|
|
def convert_document(input_file):
    """Convert one document and write all exports under ``scratch/<stem>/``.

    For each conversion result the document is written as Markdown, JSON and
    YAML, after which tables, pictures and the multimodal Parquet file are
    exported through ``export_tables_and_figures``.

    Args:
        input_file: Path of the document to convert, as a ``str`` or
            ``pathlib.Path``. For backward compatibility an object exposing
            the path through a ``name`` attribute (for example a
            temporary-file wrapper) is accepted as well.

    Returns:
        list[str]: Resolved paths of every file written: the Markdown, JSON
        and YAML exports followed by the files reported by
        ``export_tables_and_figures``.

    Raises:
        ValueError: If ``input_file`` is ``None``.
    """
    # Validate before touching the filesystem so a missing upload has no
    # side effects and yields a clear message instead of an AttributeError.
    if input_file is None:
        raise ValueError("No input file was provided.")

    # The Gradio File input of this app is declared with type="filepath", so
    # the callback receives a plain path string; str has no ``.name``.
    # Path must be checked explicitly as well: Path.name is only the final
    # component, so the ``.name`` fallback would drop the directory part.
    if isinstance(input_file, (str, Path)):
        input_path = Path(input_file)
    else:
        input_path = Path(input_file.name)

    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)

    doc_converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
            InputFormat.DOCX,
            InputFormat.HTML,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend,
            )
        },
    )

    output_files = []
    for res in doc_converter.convert_all([input_path]):
        stem = res.input.file.stem
        out_path = output_dir / stem
        out_path.mkdir(parents=True, exist_ok=True)

        md_path = out_path / f"{stem}.md"
        json_path = out_path / f"{stem}.json"
        yaml_path = out_path / f"{stem}.yaml"

        # Build the dict export once and reuse it for both JSON and YAML.
        doc_dict = res.document.export_to_dict()

        md_path.write_text(res.document.export_to_markdown(), encoding="utf-8")
        json_path.write_text(
            json.dumps(doc_dict, ensure_ascii=False), encoding="utf-8"
        )
        yaml_path.write_text(
            yaml.safe_dump(doc_dict, allow_unicode=True), encoding="utf-8"
        )

        output_files.extend(
            str(path.resolve()) for path in (md_path, json_path, yaml_path)
        )
        output_files.extend(export_tables_and_figures(res, out_path))

    return output_files
|
|
|
|
|
def gradio_interface(input_file):
    """Gradio callback: convert the uploaded file and return the generated file paths."""
    return convert_document(input_file)
|
|
|
|
|
# Gradio UI: a single file upload in, the list of generated export files out.
iface = gr.Interface(
    # Presentation.
    title="Document Conversion with OCR",
    description="Upload your document or image, and get the converted output with OCR and other exports.",
    theme=themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"),
    allow_flagging="never",
    # Wiring: the callback gets the uploaded file and returns the files to offer
    # for download.
    fn=gradio_interface,
    inputs=gr.File(file_count="single", type="filepath"),
    outputs=gr.File(file_count="multiple"),
)
|
|
|
if __name__ == "__main__":
    # Launch the Gradio app only when this file is executed as a script,
    # not when it is imported.
    iface.launch()