muhammadsalmanalfaridzi committed on
Commit
411f800
·
verified ·
1 Parent(s): 592522e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +171 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import json
3
+ import yaml
4
+ import gradio as gr
5
+ from pathlib import Path
6
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
7
+ from docling.datamodel.base_models import InputFormat
8
+ from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
9
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions
10
+ from docling.utils.export import generate_multimodal_pages
11
+ from docling.utils.utils import create_hash
12
+ import pandas as pd
13
+ import time
14
+ import datetime
15
+
16
# Module-level logging set-up.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# OCR configuration: force OCR over the full page, with Indonesian and
# English recognition models.
ocr_options = EasyOcrOptions(force_full_page_ocr=True)
ocr_options.lang = ["id", "en"]  # OCR languages

# PDF pipeline configuration: OCR enabled, table-structure recovery with
# cell matching and the more accurate TableFormer model.
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.do_ocr = True  # Enable OCR for images and text
pipeline_options.ocr_options = ocr_options
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # More accurate table model
29
# Function to handle document conversion and exports
def export_tables_and_figures(conv_res, output_dir):
    """Export tables, pictures, and multimodal pages from a converted document.

    Args:
        conv_res: docling conversion result; ``conv_res.document`` supplies
            the tables/pictures and ``conv_res.input`` identifies the source
            file (stem, name, document hash).
        output_dir: ``pathlib.Path`` directory that receives every export.

    Returns:
        list[str]: absolute paths of all files written (CSV/HTML per table,
        PNG per picture, one Parquet of multimodal page rows).
    """
    start_time = time.time()

    output_files = []

    # --- Tables: one CSV and one HTML file per detected table ---
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df = table.export_to_dataframe()
        table_csv_filename = output_dir / f"{conv_res.input.file.stem}-table-{table_ix + 1}.csv"
        table_html_filename = output_dir / f"{conv_res.input.file.stem}-table-{table_ix + 1}.html"

        _log.info(f"Saving CSV table to {table_csv_filename}")
        table_df.to_csv(table_csv_filename)

        _log.info(f"Saving HTML table to {table_html_filename}")
        # FIX: open with an explicit UTF-8 encoding. OCR is configured for
        # Indonesian/English, so table text can contain non-ASCII characters
        # that a non-UTF-8 platform default encoding would fail to write.
        # (Matches the UTF-8 handling used for the md/json/yaml exports.)
        with table_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html())

        # Append to output files
        output_files.append(table_csv_filename)
        output_files.append(table_html_filename)

    # --- Pictures: save each embedded image as PNG ---
    for picture_ix, picture in enumerate(conv_res.document.pictures):
        if picture.image:  # picture.image may be None for unrendered items
            picture_image_filename = output_dir / f"{conv_res.input.file.stem}-picture-{picture_ix + 1}.png"
            _log.info(f"Saving Picture to {picture_image_filename}")
            picture.image.save(picture_image_filename)

            # Append to output files
            output_files.append(picture_image_filename)
        else:
            _log.warning(f"Skipping picture {picture_ix + 1} due to missing image.")

    # --- Multimodal pages: one row per page, bundled into one Parquet ---
    rows = []
    for content_text, content_md, content_dt, page_cells, page_segments, page in generate_multimodal_pages(conv_res):
        try:
            dpi = page._default_image_scale * 72
            # page.image may be None; fall back to zero sizes / no bytes.
            image_width = image_height = 0
            image_bytes = None
            if page.image:
                image_width = page.image.width
                image_height = page.image.height
                image_bytes = page.image.tobytes()

            rows.append({
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                "page_hash": create_hash(conv_res.input.document_hash + ":" + str(page.page_no - 1)),
                "image": {
                    "width": image_width,
                    "height": image_height,
                    "bytes": image_bytes,
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            })
        except Exception as e:
            # Best-effort: one bad page should not abort the whole export.
            _log.warning(f"Failed to process page {page.page_no + 1}: {e}")

    # Generate one Parquet from all pages of this document.
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)

    # Append to output files
    output_files.append(output_filename)

    end_time = time.time() - start_time
    _log.info(f"Tables, figures, and multimodal pages exported in {end_time:.2f} seconds.")

    return [str(file.resolve()) for file in output_files]
115
+
116
# Main conversion function
def convert_document(input_file):
    """Convert an uploaded document and export it in multiple formats.

    Args:
        input_file: the uploaded document — either a filepath string (what
            ``gr.File(type="filepath")`` delivers) or a file-like object
            exposing a ``.name`` attribute.

    Returns:
        list[str]: absolute paths of every exported file (markdown, JSON,
        YAML, plus the table/picture/multimodal exports).
    """
    # Create a temporary output directory
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create DocumentConverter instance
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE, InputFormat.DOCX, InputFormat.HTML],
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend)}
    )

    # FIX: gr.File(type="filepath") passes a plain string, which has no
    # ``.name`` attribute, so the original ``input_file.name`` raised
    # AttributeError. Accept both strings and file-like upload objects.
    input_path = Path(getattr(input_file, "name", input_file))
    conv_results = doc_converter.convert_all([input_path])

    # Export to markdown, json, yaml with UTF-8 encoding
    output_files = []
    for res in conv_results:
        out_path = output_dir / res.input.file.stem
        out_path.mkdir(parents=True, exist_ok=True)

        # Export Markdown, JSON, and YAML with utf-8 encoding
        with (out_path / f"{res.input.file.stem}.md").open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())
        with (out_path / f"{res.input.file.stem}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(res.document.export_to_dict(), ensure_ascii=False))
        with (out_path / f"{res.input.file.stem}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(res.document.export_to_dict(), allow_unicode=True))

        # Append to output files
        output_files.append(str((out_path / f"{res.input.file.stem}.md").resolve()))
        output_files.append(str((out_path / f"{res.input.file.stem}.json").resolve()))
        output_files.append(str((out_path / f"{res.input.file.stem}.yaml").resolve()))

        # Export tables, figures, and multimodal content
        output_files.extend(export_tables_and_figures(res, out_path))

    return output_files
155
+
156
# Gradio callback: thin wrapper around the conversion pipeline.
def gradio_interface(input_file):
    """Convert the uploaded document and return the list of export paths."""
    return convert_document(input_file)
160
+
161
# Build the Gradio UI: one uploaded document in, a bundle of export files out.
_upload_component = gr.File(file_count="single", type="filepath")
_download_component = gr.File(file_count="multiple")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_upload_component,
    outputs=_download_component,
    title="Document Conversion with OCR",
    description="Upload your document or image, and get the converted output with OCR and other exports.",
    allow_flagging="never",
)

# Start the web app only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ docling
2
+ gradio
3
+ pandas
4
+ pyyaml
5
+ pyarrow