muhammadsalmanalfaridzi committed on
Commit
411f800
·
verified ·
1 Parent(s): 592522e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +171 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import json
3
+ import yaml
4
+ import gradio as gr
5
+ from pathlib import Path
6
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
7
+ from docling.datamodel.base_models import InputFormat
8
+ from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption
9
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions
10
+ from docling.utils.export import generate_multimodal_pages
11
+ from docling.utils.utils import create_hash
12
+ import pandas as pd
13
+ import time
14
+ import datetime
15
+
16
# Module-level logging set-up.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# OCR configuration: force OCR over the full page, with Indonesian and
# English recognition models.
ocr_options = EasyOcrOptions(force_full_page_ocr=True)
ocr_options.lang = ["id", "en"]  # OCR languages

# PDF pipeline configuration: OCR enabled, table-structure recovery with
# cell matching and the more accurate TableFormer model.
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.do_ocr = True  # Enable OCR for images and text
pipeline_options.ocr_options = ocr_options
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE  # More accurate table model
29
# Function to handle document conversion and exports
def export_tables_and_figures(conv_res, output_dir):
    """Export tables, pictures, and multimodal pages from a converted document.

    Args:
        conv_res: docling conversion result; ``conv_res.document`` supplies
            the tables/pictures and ``conv_res.input`` identifies the source
            file (stem, name, document hash).
        output_dir: ``pathlib.Path`` directory that receives every export.

    Returns:
        list[str]: absolute paths of all files written (CSV/HTML per table,
        PNG per picture, one Parquet of multimodal page rows).
    """
    start_time = time.time()

    output_files = []

    # --- Tables: one CSV and one HTML file per detected table ---
    for table_ix, table in enumerate(conv_res.document.tables):
        table_df = table.export_to_dataframe()
        table_csv_filename = output_dir / f"{conv_res.input.file.stem}-table-{table_ix + 1}.csv"
        table_html_filename = output_dir / f"{conv_res.input.file.stem}-table-{table_ix + 1}.html"

        _log.info(f"Saving CSV table to {table_csv_filename}")
        table_df.to_csv(table_csv_filename)

        _log.info(f"Saving HTML table to {table_html_filename}")
        # FIX: open with an explicit UTF-8 encoding. OCR is configured for
        # Indonesian/English, so table text can contain non-ASCII characters
        # that a non-UTF-8 platform default encoding would fail to write.
        # (Matches the UTF-8 handling used for the md/json/yaml exports.)
        with table_html_filename.open("w", encoding="utf-8") as fp:
            fp.write(table.export_to_html())

        # Append to output files
        output_files.append(table_csv_filename)
        output_files.append(table_html_filename)

    # --- Pictures: save each embedded image as PNG ---
    for picture_ix, picture in enumerate(conv_res.document.pictures):
        if picture.image:  # picture.image may be None for unrendered items
            picture_image_filename = output_dir / f"{conv_res.input.file.stem}-picture-{picture_ix + 1}.png"
            _log.info(f"Saving Picture to {picture_image_filename}")
            picture.image.save(picture_image_filename)

            # Append to output files
            output_files.append(picture_image_filename)
        else:
            _log.warning(f"Skipping picture {picture_ix + 1} due to missing image.")

    # --- Multimodal pages: one row per page, bundled into one Parquet ---
    rows = []
    for content_text, content_md, content_dt, page_cells, page_segments, page in generate_multimodal_pages(conv_res):
        try:
            dpi = page._default_image_scale * 72
            # page.image may be None; fall back to zero sizes / no bytes.
            image_width = image_height = 0
            image_bytes = None
            if page.image:
                image_width = page.image.width
                image_height = page.image.height
                image_bytes = page.image.tobytes()

            rows.append({
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                "page_hash": create_hash(conv_res.input.document_hash + ":" + str(page.page_no - 1)),
                "image": {
                    "width": image_width,
                    "height": image_height,
                    "bytes": image_bytes,
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            })
        except Exception as e:
            # Best-effort: one bad page should not abort the whole export.
            _log.warning(f"Failed to process page {page.page_no + 1}: {e}")

    # Generate one Parquet from all pages of this document.
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)

    # Append to output files
    output_files.append(output_filename)

    end_time = time.time() - start_time
    _log.info(f"Tables, figures, and multimodal pages exported in {end_time:.2f} seconds.")

    return [str(file.resolve()) for file in output_files]
115
+
116
# Main conversion function
def convert_document(input_file):
    """Convert an uploaded document and export it in multiple formats.

    Args:
        input_file: the uploaded document — either a filepath string (what
            ``gr.File(type="filepath")`` delivers) or a file-like object
            exposing a ``.name`` attribute.

    Returns:
        list[str]: absolute paths of every exported file (markdown, JSON,
        YAML, plus the table/picture/multimodal exports).
    """
    # Create a temporary output directory
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Create DocumentConverter instance
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE, InputFormat.DOCX, InputFormat.HTML],
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend)}
    )

    # FIX: gr.File(type="filepath") passes a plain string, which has no
    # ``.name`` attribute, so the original ``input_file.name`` raised
    # AttributeError. Accept both strings and file-like upload objects.
    input_path = Path(getattr(input_file, "name", input_file))
    conv_results = doc_converter.convert_all([input_path])

    # Export to markdown, json, yaml with UTF-8 encoding
    output_files = []
    for res in conv_results:
        out_path = output_dir / res.input.file.stem
        out_path.mkdir(parents=True, exist_ok=True)

        # Export Markdown, JSON, and YAML with utf-8 encoding
        with (out_path / f"{res.input.file.stem}.md").open("w", encoding="utf-8") as fp:
            fp.write(res.document.export_to_markdown())
        with (out_path / f"{res.input.file.stem}.json").open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(res.document.export_to_dict(), ensure_ascii=False))
        with (out_path / f"{res.input.file.stem}.yaml").open("w", encoding="utf-8") as fp:
            fp.write(yaml.safe_dump(res.document.export_to_dict(), allow_unicode=True))

        # Append to output files
        output_files.append(str((out_path / f"{res.input.file.stem}.md").resolve()))
        output_files.append(str((out_path / f"{res.input.file.stem}.json").resolve()))
        output_files.append(str((out_path / f"{res.input.file.stem}.yaml").resolve()))

        # Export tables, figures, and multimodal content
        output_files.extend(export_tables_and_figures(res, out_path))

    return output_files
155
+
156
# Gradio callback: thin wrapper around the conversion pipeline.
def gradio_interface(input_file):
    """Convert the uploaded document and return the list of export paths."""
    return convert_document(input_file)
160
+
161
# Build the Gradio UI: one uploaded document in, a bundle of export files out.
_upload_component = gr.File(file_count="single", type="filepath")
_download_component = gr.File(file_count="multiple")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_upload_component,
    outputs=_download_component,
    title="Document Conversion with OCR",
    description="Upload your document or image, and get the converted output with OCR and other exports.",
    allow_flagging="never",
)

# Start the web app only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ docling
2
+ gradio
3
+ pandas
4
+ pyyaml
5
+ pyarrow