import logging
import json
import yaml
import gradio as gr
import gradio.themes as themes
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash

import pandas as pd
import time
import datetime

# Set up logging
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# OCR configuration
ocr_options = EasyOcrOptions(force_full_page_ocr=True)
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.do_ocr = True  # Enable OCR for images and text
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # More accurate table model
pipeline_options.ocr_options = ocr_options
pipeline_options.ocr_options.lang = ["id", "en"]  # OCR languages


def export_tables_and_figures(conv_res, output_dir):
    """Export tables, figures, and multimodal pages from the converted document."""
    start_time = time.time()
    output_files = []

    # Export tables
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df = table.export_to_dataframe()
        table_csv_filename = output_dir / f"{conv_res.input.file.stem}-table-{table_ix + 1}.csv"
        table_html_filename = output_dir / f"{conv_res.input.file.stem}-table-{table_ix + 1}.html"

        _log.info(f"Saving CSV table to {table_csv_filename}")
        table_df.to_csv(table_csv_filename)

        _log.info(f"Saving HTML table to {table_html_filename}")
        with table_html_filename.open("w") as fp:
            fp.write(table.export_to_html())

        output_files.append(table_csv_filename)
        output_files.append(table_html_filename)

    # Export pictures (the document model exposes images as 'pictures')
    for picture_ix, picture in enumerate(conv_res.document.pictures):
        if picture.image:  # Skip pictures without an attached image
            picture_image_filename = output_dir / f"{conv_res.input.file.stem}-picture-{picture_ix + 1}.png"
            _log.info(f"Saving picture to {picture_image_filename}")
            picture.image.save(picture_image_filename)
            output_files.append(picture_image_filename)
        else:
            _log.warning(f"Skipping picture {picture_ix + 1} due to missing image.")

    # Export multimodal pages
    rows = []
    for content_text, content_md, content_dt, page_cells, page_segments, page in generate_multimodal_pages(conv_res):
        try:
            dpi = page._default_image_scale * 72

            # page.image may be None; fall back to empty image metadata
            image_width = image_height = 0
            image_bytes = None
            if page.image:
                image_width = page.image.width
                image_height = page.image.height
                image_bytes = page.image.tobytes()

            rows.append({
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                "page_hash": create_hash(conv_res.input.document_hash + ":" + str(page.page_no - 1)),
                "image": {
                    "width": image_width,
                    "height": image_height,
                    "bytes": image_bytes,
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            })
        except Exception as e:
            _log.warning(f"Failed to process page {page.page_no + 1}: {e}")
    # Collect all multimodal rows into a single Parquet file
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)
    output_files.append(output_filename)

    elapsed = time.time() - start_time
    _log.info(f"Tables, figures, and multimodal pages exported in {elapsed:.2f} seconds.")

    return [str(file.resolve()) for file in output_files]


def convert_document(input_file):
    """Convert a single uploaded document and return the paths of all exported files."""
    # Create a local output directory
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create the DocumentConverter instance
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE, InputFormat.DOCX, InputFormat.HTML],
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend)
        },
    )

    # gr.File(type="filepath") passes a plain path string; older Gradio versions
    # pass a file-like object with a .name attribute. Handle both.
    input_path = Path(input_file if isinstance(input_file, str) else input_file.name)
    conv_results = doc_converter.convert_all([input_path])

    # Export Markdown, JSON, and YAML with UTF-8 encoding
    output_files = []
    for res in conv_results:
        out_path = output_dir / res.input.file.stem
        out_path.mkdir(parents=True, exist_ok=True)

        with (out_path / f"{res.input.file.stem}.md").open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())
        with (out_path / f"{res.input.file.stem}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(res.document.export_to_dict(), ensure_ascii=False))
        with (out_path / f"{res.input.file.stem}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(res.document.export_to_dict(), allow_unicode=True))

        output_files.append(str((out_path / f"{res.input.file.stem}.md").resolve()))
        output_files.append(str((out_path / f"{res.input.file.stem}.json").resolve()))
        output_files.append(str((out_path / f"{res.input.file.stem}.yaml").resolve()))

        # Export tables, figures, and multimodal content
        output_files.extend(export_tables_and_figures(res, out_path))

    return output_files


def gradio_interface(input_file):
    """Gradio callback: convert the uploaded file and return the exported files."""
    return convert_document(input_file)


# Build the Gradio interface with a theme
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_count="single", type="filepath"),
    outputs=gr.File(file_count="multiple"),
    title="Document Conversion with OCR",
    description="Upload your document or image, and get the converted output with OCR and other exports.",
    allow_flagging="never",
    theme=themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"),
)

if __name__ == "__main__":
    iface.launch()
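
# Usage sketch (assumptions: the imports above are installed, e.g.
# `pip install docling gradio pandas pyyaml pyarrow` -- pandas needs pyarrow or
# fastparquet for the Parquet export -- and "sample.pdf" is a hypothetical
# local file). The converter can also be driven without the UI, since
# convert_document accepts a plain path string:
#
#     for exported in convert_document("sample.pdf"):
#         print(exported)
#
# When run as a script, Gradio serves the upload form on its default local
# URL (http://127.0.0.1:7860) and lists every exported file for download.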