mirnaresearch committed on
Commit ca5b08e · 0 Parent(s)

Initial commit for HF Space (no images)

.dockerignore ADDED
@@ -0,0 +1,5 @@
1
+ *
2
+ **/*
3
+ .*
4
+ !ocrflux
5
+ !pyproject.toml
.github/workflows/docker.yml ADDED
@@ -0,0 +1,46 @@
1
+ on:
2
+ push:
3
+ branches:
4
+ - main
5
+ - 'v*.*.*'
6
+
7
+ jobs:
8
+ build_and_push_docker:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - name: Checkout repository
13
+ uses: actions/checkout@v4
14
+
15
+ - name: Set up Docker Buildx
16
+ uses: docker/setup-buildx-action@v3
17
+
18
+ - name: Log in to Docker Hub
19
+ uses: docker/login-action@v3
20
+ with:
21
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
22
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
23
+
24
+ - name: Determine image tags
25
+ id: determine_tags
26
+ run: |
27
+ BRANCH_NAME=${{ github.ref_name }}
28
+ DOCKER_IMAGE_NAME="chatdoc/ocrflux"
29
+
30
+ if [[ "$BRANCH_NAME" == "main" ]]; then
31
+ echo "IMAGE_TAGS=$DOCKER_IMAGE_NAME:latest,$DOCKER_IMAGE_NAME:$BRANCH_NAME"
32
+ echo "image_tags=$DOCKER_IMAGE_NAME:latest,$DOCKER_IMAGE_NAME:$BRANCH_NAME" >> $GITHUB_OUTPUT
33
+ else
34
+ echo "IMAGE_TAGS=$DOCKER_IMAGE_NAME:$BRANCH_NAME"
35
+ echo "image_tags=$DOCKER_IMAGE_NAME:$BRANCH_NAME" >> $GITHUB_OUTPUT
36
+ fi
37
+
38
+ - name: Build and push Docker image
39
+ id: docker_build
40
+ uses: docker/build-push-action@v6
41
+ with:
42
+ context: .
43
+ push: true
44
+ tags: ${{ steps.determine_tags.outputs.image_tags }}
45
+ cache-from: type=gha,scope=${{ github.workflow }}
46
+ cache-to: type=gha,scope=${{ github.workflow }},mode=max
.gitignore ADDED
@@ -0,0 +1,194 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Abstra
171
+ # Abstra is an AI-powered process automation framework.
172
+ # Ignore directories containing user credentials, local state, and settings.
173
+ # Learn more at https://abstra.io/docs
174
+ .abstra/
175
+
176
+ # Visual Studio Code
177
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
178
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
179
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
180
+ # you could uncomment the following to ignore the enitre vscode folder
181
+ # .vscode/
182
+
183
+ # Ruff stuff:
184
+ .ruff_cache/
185
+
186
+ # PyPI configuration file
187
+ .pypirc
188
+
189
+ # Cursor
190
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
191
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
192
+ # refer to https://docs.cursor.com/context/ignore-files
193
+ .cursorignore
194
+ .cursorindexingignore
Dockerfile ADDED
@@ -0,0 +1,43 @@
1
+ FROM ubuntu:24.04
2
+
3
+ WORKDIR /OCRFlux
4
+
5
+ ENV LANG=en_US.UTF-8 \
6
+ PIP_ROOT_USER_ACTION=ignore \
7
+ PIP_BREAK_SYSTEM_PACKAGES=true \
8
+ PIP_NO_CACHE_DIR=true \
9
+ PIP_DISABLE_PIP_VERSION_CHECK=true \
10
+ PYTHONPATH=/OCRFlux
11
+
12
+ SHELL ["/bin/bash", "-c"]
13
+
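+ # Copy the bind-mounted build context into /OCRFlux, install system fonts, poppler and Python 3.12,
+ # then install the package with pip and clean apt/pip caches, all in a single layer to keep the image small.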
14
+ RUN --mount=type=bind,source=./,target=/builder \
15
+ cp -a /builder/. /OCRFlux/ && \
16
+ set -o pipefail && \
17
+ apt-get update && \
18
+ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
19
+ ca-certificates \
20
+ curl \
21
+ fonts-crosextra-caladea \
22
+ fonts-crosextra-carlito \
23
+ gsfonts \
24
+ lcdf-typetools \
25
+ locales \
26
+ msttcorefonts \
27
+ poppler-utils \
28
+ poppler-data \
29
+ python3.12-dev \
30
+ python3.12-full \
31
+ software-properties-common \
32
+ ttf-mscorefonts-installer && \
33
+ locale-gen en_US.UTF-8 && \
34
+ curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py && \
35
+ python3.12 /tmp/get-pip.py && \
36
+ python3.12 -m pip install . --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/ && \
37
+ rm -rf ./* \
38
+ /var/lib/apt/lists/* \
39
+ /tmp/* \
40
+ /root/.cache/pip &&\
41
+ find /var/log /var/cache -type f -delete
42
+
43
+ ENTRYPOINT ["python3.12", "-m", "ocrflux.pipeline"]
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,397 @@
1
+ <div align="center">
2
+ <img src="./images/OCRFlux.png" alt="OCRFlux Logo" width="300"/>
3
+ <hr/>
4
+ </div>
5
+ <p align="center">
6
+ <a href="https://github.com/chatdoc-com/OCRFlux/blob/main/LICENSE">
7
+ <img alt="GitHub License" src="./images/license.svg" height="20">
8
+ </a>
9
+ <a href="https://github.com/chatdoc-com/OCRFlux/releases">
10
+ <img alt="GitHub release" src="./images/release.svg" height="20">
11
+ </a>
12
+ <a href="https://ocrflux.pdfparser.io/">
13
+ <img alt="Demo" src="./images/demo.svg" height="20">
14
+ </a>
15
+ <a href="https://discord.gg/F33mhsAqqg">
16
+ <img alt="Discord" src="./images/discord.svg" height="20">
17
+ </a>
18
+ </p>
19
+
20
+ OCRFlux is a toolkit built on a multimodal large language model for converting PDFs and images into clean, readable, plain Markdown text. It aims to push the current state of the art to a significantly higher level.
21
+
22
+ Try the online demo: [OCRFlux Demo](https://ocrflux.pdfparser.io/)
23
+
24
+ Functions: **Whole file parsing**
25
+ - On each page
26
+ - Convert into text with a natural reading order, even in the presence of multi-column layouts, figures, and insets
27
+ - Support for complicated tables and equations
28
+ - Automatically removes headers and footers
29
+
30
+ - Cross-page table/paragraph merging
31
+ - Cross-page table merging
32
+ - Cross-page paragraph merging
33
+
34
+
35
+ Key features:
36
+ - Superior parsing quality on each page
37
+
38
+ On our released benchmark [OCRFlux-bench-single](https://huggingface.co/datasets/ChatDOC/OCRFlux-bench-single), it achieves an Edit Distance Similarity (EDS) of 0.967, which is 0.095 higher than the baseline [olmOCR-7B-0225-preview](https://huggingface.co/allenai/olmOCR-7B-0225-preview) (0.872), 0.109 higher than [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s) (0.858), and 0.187 higher than [MonkeyOCR](https://huggingface.co/echo840/MonkeyOCR) (0.780).
39
+
40
+ - Native support for cross-page table/paragraph merging (to the best of our knowledge, this is the first open-source project to support this feature).
41
+
42
+ - Based on a 3B-parameter VLM, so it can run even on an RTX 3090 GPU.
43
+
44
+ Release:
45
+ - [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) - 3B parameter VLM
46
+ - Benchmark for evaluation
47
+ - [OCRFlux-bench-single](https://huggingface.co/datasets/ChatDOC/OCRFlux-bench-single)
48
+ - [OCRFlux-pubtabnet-single](https://huggingface.co/datasets/ChatDOC/OCRFlux-pubtabnet-single)
49
+ - [OCRFlux-bench-cross](https://huggingface.co/datasets/ChatDOC/OCRFlux-bench-cross)
50
+ - [OCRFlux-pubtabnet-cross](https://huggingface.co/datasets/ChatDOC/OCRFlux-pubtabnet-cross)
51
+
52
+
53
+ ### News
54
+ - Jun 17, 2025 - v0.1.0 - Initial public launch and demo.
55
+
56
+ ### Benchmark for single-page parsing
57
+
58
+ We ship two comprehensive benchmarks to help measure the performance of our OCR system in single-page parsing:
59
+
60
+ - [OCRFlux-bench-single](https://huggingface.co/datasets/ChatDOC/OCRFlux-bench-single): Contains 2000 PDF pages (1000 English and 1000 Chinese) and their ground-truth Markdowns (manually labeled with multi-round checks).
61
+
62
+ - [OCRFlux-pubtabnet-single](https://huggingface.co/datasets/ChatDOC/OCRFlux-pubtabnet-single): Derived from the public [PubTabNet](https://github.com/ibm-aur-nlp/PubTabNet) benchmark with some format transformation. It contains 9064 HTML table samples, which are split into simple tables and complex tables according to whether they have rowspan and colspan cells.
63
+
64
+ We emphasize that the released benchmarks are NOT included in our training and evaluation data. The main results are as follows:
65
+
66
+
67
+ 1. In [OCRFlux-bench-single](https://huggingface.co/datasets/ChatDOC/OCRFlux-bench-single), we calculated the Edit Distance Similarity (EDS) between the generated Markdowns and the ground-truth Markdowns as the metric.
68
+
69
+ <table>
70
+ <thead>
71
+ <tr>
72
+ <th>Language</th>
73
+ <th>Model</th>
74
+ <th>Avg EDS ↑</th>
75
+ </tr>
76
+ </thead>
77
+ <tbody>
78
+ <tr>
79
+ <td rowspan="4">English</td>
80
+ <td>olmOCR-7B-0225-preview</td>
81
+ <td>0.885</td>
82
+ </tr>
83
+ <tr>
84
+ <td>Nanonets-OCR-s</td>
85
+ <td>0.870</td>
86
+ </tr>
87
+ <tr>
88
+ <td>MonkeyOCR</td>
89
+ <td>0.828</td>
90
+ </tr>
91
+ <tr>
92
+ <td><strong><a href="https://huggingface.co/ChatDOC/OCRFlux-3B">OCRFlux-3B</a></strong></td>
93
+ <td>0.971</td>
94
+ </tr>
95
+ <tr>
96
+ <td rowspan="4">Chinese</td>
97
+ <td>olmOCR-7B-0225-preview</td>
98
+ <td>0.859</td>
99
+ </tr>
100
+ <tr>
101
+ <td>Nanonets-OCR-s</td>
102
+ <td>0.846</td>
103
+ </tr>
104
+ <tr>
105
+ <td>MonkeyOCR</td>
106
+ <td>0.731</td>
107
+ </tr>
108
+ <tr>
109
+ <td><strong><a href="https://huggingface.co/ChatDOC/OCRFlux-3B">OCRFlux-3B</a></strong></td>
110
+ <td>0.962</td>
111
+ </tr>
112
+ <tr>
113
+ <td rowspan="4">Total</td>
114
+ <td>olmOCR-7B-0225-preview</td>
115
+ <td>0.872</td>
116
+ </tr>
117
+ <tr>
118
+ <td>Nanonets-OCR-s</td>
119
+ <td>0.858</td>
120
+ </tr>
121
+ <tr>
122
+ <td>MonkeyOCR</td>
123
+ <td>0.780</td>
124
+ </tr>
125
+ <tr>
126
+ <td><strong><a href="https://huggingface.co/ChatDOC/OCRFlux-3B">OCRFlux-3B</a></strong></td>
127
+ <td>0.967</td>
128
+ </tr>
129
+ </tbody>
130
+ </table>
131
+
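+ The EDS metric itself is simple; a minimal sketch (mirroring the `evaluate` helper in `eval/eval_page_to_markdown.py`) is shown below:
+
+ ```python
+ import nltk
+
+ def eds(pred_markdown: str, gt_markdown: str) -> float:
+     # Edit Distance Similarity: 1 minus the normalized character-level edit distance.
+     edit_dist = nltk.edit_distance(pred_markdown, gt_markdown) / max(len(pred_markdown), len(gt_markdown))
+     return 1.0 - edit_dist
+ ```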
132
+ 2. In [OCRFlux-pubtabnet-single](https://huggingface.co/datasets/ChatDOC/OCRFlux-pubtabnet-single), we calculated the Tree Edit Distance-based Similarity (TEDS) between the generated HTML tables and the ground-truth HTML tables as the metric.
133
+ <table>
134
+ <thead>
135
+ <tr>
136
+ <th>Type</th>
137
+ <th>Model</th>
138
+ <th>Avg TEDS ↑</th>
139
+ </tr>
140
+ </thead>
141
+ <tbody>
142
+ <tr>
143
+ <td rowspan="4">Simple</td>
144
+ <td>olmOCR-7B-0225-preview</td>
145
+ <td>0.810</td>
146
+ </tr>
147
+ <tr>
148
+ <td>Nanonets-OCR-s</td>
149
+ <td>0.882</td>
150
+ </tr>
151
+ <tr>
152
+ <td>MonkeyOCR</td>
153
+ <td>0.880</td>
154
+ </tr>
155
+ <tr>
156
+ <td><strong><a href="https://huggingface.co/ChatDOC/OCRFlux-3B">OCRFlux-3B</a></strong></td>
157
+ <td>0.912</td>
158
+ </tr>
159
+ <tr>
160
+ <td rowspan="4">Complex</td>
161
+ <td>olmOCR-7B-0225-preview</td>
162
+ <td>0.676</td>
163
+ </tr>
164
+ <tr>
165
+ <td>Nanonets-OCR-s</td>
166
+ <td>0.772</td>
167
+ </tr>
168
+ <tr>
169
+ <td><strong>MonkeyOCR</strong></td>
170
+ <td>0.826</td>
171
+ </tr>
172
+ <tr>
173
+ <td><a href="https://huggingface.co/ChatDOC/OCRFlux-3B">OCRFlux-3B</a></td>
174
+ <td>0.807</td>
175
+ </tr>
176
+ <tr>
177
+ <td rowspan="4">Total</td>
178
+ <td>olmOCR-7B-0225-preview</td>
179
+ <td>0.744</td>
180
+ </tr>
181
+ <tr>
182
+ <td>Nanonets-OCR-s</td>
183
+ <td>0.828</td>
184
+ </tr>
185
+ <tr>
186
+ <td>MonkeyOCR</td>
187
+ <td>0.853</td>
188
+ </tr>
189
+ <tr>
190
+ <td><strong><a href="https://huggingface.co/ChatDOC/OCRFlux-3B">OCRFlux-3B</a></strong></td>
191
+ <td>0.861</td>
192
+ </tr>
193
+ </tbody>
194
+ </table>
195
+
196
+ We also present some case studies in the [blog](https://ocrflux.pdfparser.io/#/blog) article to illustrate the superiority of our model.
197
+
198
+ ### Benchmark for cross-page table/paragraph merging
199
+
200
+ PDF documents are typically paginated, which often results in tables or paragraphs being split across consecutive pages. Accurately detecting and merging such cross-page structures is crucial to avoid generating incomplete or fragmented content.
201
+
202
+ The detection task can be formulated as follows: given the Markdowns of two consecutive pages—each structured as a list of Markdown elements (e.g., paragraphs and tables)—the goal is to identify the indexes of elements that should be merged across the pages.
203
+
204
+ For the merging task itself, paragraphs can simply be concatenated. Merging two table fragments, however, is much more challenging. For example, a table spanning multiple pages may repeat the header of the first page on the second page. Another difficult scenario is a cell whose long content wraps across multiple lines, with the first few lines appearing on the previous page and the remaining lines continuing on the next page. We also observe cases where tables with a large number of columns are split vertically and placed on two consecutive pages. More examples of cross-page tables can be found in our [blog](https://ocrflux.pdfparser.io/#/blog) article. To address these issues, we developed an LLM for cross-page table merging: it takes two split table fragments as input and generates a complete, well-structured table as output.
205
+
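+ As a purely illustrative (hypothetical) example, the detection task takes the element lists of two consecutive pages and returns the index pairs to merge:
+
+ ```python
+ # Toy data for illustration only; real elements are full Markdown paragraphs and HTML tables.
+ page_1_elements = ["## 3. Results", "The model achieves ...", "<table>... first half of a table ...</table>"]
+ page_2_elements = ["<table>... second half of the table ...</table>", "## 4. Conclusion"]
+
+ # Expected detection output: (index on page 1, index on page 2) pairs to merge (0-based here).
+ merging_idx_pairs = [(2, 0)]  # the split table spans the page break
+
+ # Paragraph pairs are merged by concatenation; table pairs go through the LLM-based table-merging model.
+ ```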
206
+ We ship two comprehensive benchmarks to help measure the performance of our OCR system in cross-page table/paragraph detection and merging tasks respectively:
207
+
208
+ - [OCRFlux-bench-cross](https://huggingface.co/datasets/ChatDOC/OCRFlux-bench-cross): Contains 1000 samples (500 English and 500 Chinese). Each sample contains the Markdown element lists of two consecutive pages, along with the indexes of the elements that need to be merged (manually labeled through multiple rounds of review). If no tables or paragraphs require merging, the index list in the annotation is left empty.
209
+
210
+ - [OCRFlux-pubtabnet-cross](https://huggingface.co/datasets/ChatDOC/OCRFlux-pubtabnet-cross): Contains 9064 pairs of split table fragments, along with their corresponding ground-truth merged versions.
211
+
212
+ These released benchmarks are likewise NOT included in our training and evaluation data. The main results are as follows:
213
+
214
+ 1. In [OCRFlux-bench-cross](https://huggingface.co/datasets/ChatDOC/OCRFlux-bench-cross), we calculated Accuracy, Precision, Recall and F1 score as the metrics. Note that a detection result is counted as correct only when the model accurately judges whether any elements need to be merged across the two pages and outputs the right indexes for them.
215
+
216
+ | Language | Precision ↑ | Recall ↑ | F1 ↑ | Accuracy ↑ |
217
+ |----------|-------------|----------|-------|------------|
218
+ | English | 0.992 | 0.964 | 0.978 | 0.978 |
219
+ | Chinese | 1.000 | 0.988 | 0.994 | 0.994 |
220
+ | Total | 0.996 | 0.976 | 0.986 | 0.986 |
221
+
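+ Scoring is strict: following `eval/eval_element_merge_detect.py`, a prediction for a page pair is counted as correct only if its merge index pairs exactly match the ground truth. A minimal sketch of this check:
+
+ ```python
+ def detection_correct(pred_pairs, gt_pairs) -> bool:
+     # Exact match after sorting; an empty list means "nothing to merge on this page pair".
+     return sorted(pred_pairs) == sorted(gt_pairs)
+ ```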
222
+ 2. In [OCRFlux-pubtabnet-cross](https://huggingface.co/datasets/ChatDOC/OCRFlux-pubtabnet-cross), we calculated the Tree Edit Distance-based Similarity (TEDS) between the generated merged table and the ground-truth merged table as the metric.
223
+
224
+ | Table type | Avg TEDS ↑ |
225
+ |------------|--------------|
226
+ | Simple | 0.965 |
227
+ | Complex | 0.935 |
228
+ | Total | 0.950 |
229
+
230
+ ### Installation
231
+
232
+ Requirements:
233
+ - Recent NVIDIA GPU (tested on RTX 3090, 4090, L40S, A100, H100) with at least 12 GB of GPU RAM
234
+ - 20GB of free disk space
235
+
236
+ You will need to install poppler-utils and additional fonts for rendering PDF images.
237
+
238
+ Install dependencies (Ubuntu/Debian)
239
+ ```bash
240
+ sudo apt-get update
241
+ sudo apt-get install poppler-utils poppler-data ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
242
+ ```
243
+
244
+ Set up a conda environment and install OCRFlux. The requirements for running OCRFlux
245
+ are difficult to install in an existing Python environment, so please create a clean Python environment to install into.
246
+ ```bash
247
+ conda create -n ocrflux python=3.11
248
+ conda activate ocrflux
249
+
250
+ git clone https://github.com/chatdoc-com/OCRFlux.git
251
+ cd OCRFlux
252
+
253
+ pip install -e . --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer/
254
+ ```
255
+
256
+ ### Local Usage Example
257
+
258
+ For quick testing, try the [web demo](https://5f65ccdc2d4fd2f364.gradio.live). To run locally, a GPU is required, as inference is powered by [vllm](https://github.com/vllm-project/vllm) under the hood.
259
+
260
+ - For a pdf document:
261
+ ```bash
262
+ python -m ocrflux.pipeline ./localworkspace --data test.pdf --model /model_dir/OCRFlux-3B
263
+ ```
264
+
265
+ - For an image:
266
+ ```bash
267
+ python -m ocrflux.pipeline ./localworkspace --data test_page.png --model /model_dir/OCRFlux-3B
268
+ ```
269
+
270
+ - For a directory of pdf or images:
271
+ ```bash
272
+ python -m ocrflux.pipeline ./localworkspace --data test_pdf_dir/* --model /model_dir/OCRFlux-3B
273
+ ```
274
+ You can set `--skip_cross_page_merge` to skip cross-page merging and speed up parsing; the pipeline will then simply concatenate the parsing results of each page to generate the final Markdown of the document.
275
+
276
+ Results will be stored as JSONL files in the `./localworkspace/results` directory.
277
+
278
+ Each line in these JSONL files is a JSON object with the following fields:
279
+
280
+ ```
281
+ {
282
+ "orig_path": str, # the path to the raw pdf or image file
283
+ "num_pages": int, # the number of pages in the pdf file
284
+ "document_text": str, # the Markdown text of the converted pdf or image file
285
+ "page_texts": dict, # the Markdown texts of each page in the pdf file, the key is the page index and the value is the Markdown text of the page
286
+ "fallback_pages": [int], # the page indexes that are not converted successfully
287
+ }
288
+ ```
289
+
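+ As a small illustration (this loop is a sketch, not part of the CLI), the result files can be post-processed directly from Python:
+
+ ```python
+ import glob
+ import json
+
+ for path in glob.glob("./localworkspace/results/*.jsonl"):
+     with open(path) as f:
+         for line in f:
+             record = json.loads(line)
+             # document_text holds the full Markdown; fallback_pages lists pages that failed to convert.
+             print(record["orig_path"], record["num_pages"], record["fallback_pages"])
+ ```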
290
+ ### API for directly calling OCRFlux (New)
291
+ You can use the inference API to call OCRFlux directly in your own code, without launching an online vllm server, as follows:
292
+
293
+ ```python
294
+ from vllm import LLM
295
+ from ocrflux.inference import parse
296
+
297
+ file_path = 'test.pdf'
298
+ # file_path = 'test.png'
299
+ llm = LLM(model="model_dir/OCRFlux-3B", gpu_memory_utilization=0.8, max_model_len=8192)
300
+ result = parse(llm,file_path)
301
+ if result is not None:
302
+ document_markdown = result['document_text']
303
+ print(document_markdown)
304
+ with open('test.md','w') as f:
305
+ f.write(document_markdown)
306
+ else:
307
+ print("Parse failed.")
308
+ ```
309
+ If parsing fails or there are fallback pages in the result, you can try setting the `max_page_retries` argument of the `parse` function to a positive integer to get a better result, at the cost of longer inference time.
310
+
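+ For example (a sketch reusing the `llm` object from the snippet above):
+
+ ```python
+ # Retry pages that fail to parse up to 3 times before marking them as fallback pages.
+ result = parse(llm, file_path, max_page_retries=3)
+ ```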
311
+ ### Docker Usage
312
+
313
+ Requirements:
314
+
315
+ - Docker with GPU support [(NVIDIA Toolkit)](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
316
+ - Pre-downloaded model: [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B)
317
+
318
+ To use OCRFlux in a docker container, you can use the following example command:
319
+
320
+ ```bash
321
+ docker run -it --gpus all \
322
+ -v /path/to/localworkspace:/localworkspace \
323
+ -v /path/to/test_pdf_dir:/test_pdf_dir/ \
324
+ -v /path/to/OCRFlux-3B:/OCRFlux-3B \
325
+ chatdoc/ocrflux:latest /localworkspace --data /test_pdf_dir/* --model /OCRFlux-3B/
326
+ ```
327
+
328
+ #### Viewing Results
329
+ Generate the final Markdown files by running the following command. The generated Markdown files will be placed in the `./localworkspace/markdowns/DOCUMENT_NAME` directory.
330
+
331
+ ```bash
332
+ python -m ocrflux.jsonl_to_markdown ./localworkspace
333
+ ```
334
+
335
+ ### Full documentation for the pipeline
336
+
337
+ ```bash
338
+ python -m ocrflux.pipeline --help
339
+ usage: pipeline.py [-h] [--task {pdf2markdown,merge_pages,merge_tables}] [--data [DATA ...]] [--pages_per_group PAGES_PER_GROUP] [--max_page_retries MAX_PAGE_RETRIES]
340
+ [--max_page_error_rate MAX_PAGE_ERROR_RATE] [--workers WORKERS] [--model MODEL] [--model_max_context MODEL_MAX_CONTEXT] [--model_chat_template MODEL_CHAT_TEMPLATE]
341
+ [--target_longest_image_dim TARGET_LONGEST_IMAGE_DIM] [--skip_cross_page_merge] [--port PORT]
342
+ workspace
343
+
344
+ Manager for running millions of PDFs through a batch inference pipeline
345
+
346
+ positional arguments:
347
+ workspace The filesystem path where work will be stored, can be a local folder
348
+
349
+ options:
350
+ -h, --help show this help message and exit
351
+ --data [DATA ...] List of paths to files to process
352
+ --pages_per_group PAGES_PER_GROUP
353
+ Aiming for this many pdf pages per work item group
354
+ --max_page_retries MAX_PAGE_RETRIES
355
+ Max number of times we will retry rendering a page
356
+ --max_page_error_rate MAX_PAGE_ERROR_RATE
357
+ Rate of allowable failed pages in a document, 1/250 by default
358
+ --workers WORKERS Number of workers to run at a time
359
+ --model MODEL The path to the model
360
+ --model_max_context MODEL_MAX_CONTEXT
361
+ Maximum context length that the model was fine tuned under
362
+ --model_chat_template MODEL_CHAT_TEMPLATE
363
+ Chat template to pass to vllm server
364
+ --target_longest_image_dim TARGET_LONGEST_IMAGE_DIM
365
+ Dimension on longest side to use for rendering the pdf pages
366
+ --skip_cross_page_merge
367
+ Whether to skip cross-page merging
368
+ --port PORT Port to use for the VLLM server
369
+ ```
370
+
371
+ ## Code overview
372
+
373
+ There are some nice reusable pieces of the code that may be useful for your own projects:
374
+ - Processing millions of PDFs through our released model using VLLM - [pipeline.py](https://github.com/chatdoc-com/OCRFlux/blob/main/ocrflux/pipeline.py)
375
+ - Generating final Markdowns from jsonl files - [jsonl_to_markdown.py](https://github.com/chatdoc-com/OCRFlux/blob/main/ocrflux/jsonl_to_markdown.py)
376
+ - Evaluating the model on the single-page parsing task - [eval_page_to_markdown.py](https://github.com/chatdoc-com/OCRFlux/blob/main/eval/eval_page_to_markdown.py)
377
+ - Evaluating the model on the table parsing task - [eval_table_to_html.py](https://github.com/chatdoc-com/OCRFlux/blob/main/eval/eval_table_to_html.py)
378
+ - Evaluating the model on the paragraphs/tables merging detection task - [eval_element_merge_detect.py](https://github.com/chatdoc-com/OCRFlux/blob/main/eval/eval_element_merge_detect.py)
379
+ - Evaluating the model on the table merging task - [eval_html_table_merge.py](https://github.com/chatdoc-com/OCRFlux/blob/main/eval/eval_html_table_merge.py)
380
+
381
+
382
+ ## Team
383
+
384
+ <!-- start team -->
385
+
386
+ **OCRFlux** is developed and maintained by the ChatDOC team, backed by [ChatDOC](https://chatdoc.com/).
387
+
388
+ <!-- end team -->
389
+
390
+ ## License
391
+
392
+ <!-- start license -->
393
+
394
+ **OCRFlux** is licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0).
395
+ A full copy of the license can be found [on GitHub](https://github.com/allenai/OCRFlux/blob/main/LICENSE).
396
+
397
+ <!-- end license -->
eval/eval.sh ADDED
@@ -0,0 +1,27 @@
1
+ #!/bin/bash
2
+
3
+ # page_to_markdown task
4
+ python -m ocrflux.pipeline ./eval_page_to_markdown_result --task pdf2markdown --data /data/OCRFlux-bench-single/pdfs/*.pdf --model /data/OCRFlux-7B
5
+
6
+ python -m eval.eval_page_to_markdown ./eval_page_to_markdown_result --gt_file /data/OCRFlux-bench-single/data.jsonl
7
+
8
+ # element_merge_detect task
9
+ python -m eval.gen_element_merge_detect_data /data/OCRFlux-bench-cross
10
+
11
+ python -m ocrflux.pipeline ./eval_element_merge_detect_result --task merge_pages --data /data/OCRFlux-bench-cross/jsons/*.json --model /data/OCRFlux-7B
12
+
13
+ python -m eval.eval_element_merge_detect ./eval_element_merge_detect_result --gt_file /data/OCRFlux-bench-cross/data.jsonl
14
+
15
+ # table_to_html task
16
+ python -m ocrflux.pipeline ./eval_table_to_html_result --task pdf2markdown --data /data/OCRFlux-pubtabnet-single/images/*.png --model /data/OCRFlux-7B
17
+
18
+ python -m eval.eval_table_to_html ./eval_table_to_html_result --gt_file /data/OCRFlux-pubtabnet-single/data.jsonl
19
+
20
+ # html_table_merge task
21
+ python -m eval.gen_html_table_merge_data /data/OCRFlux-pubtabnet-cross
22
+
23
+ python -m ocrflux.pipeline ./eval_html_table_merge_result --task merge_tables --data /data/OCRFlux-pubtabnet-cross/jsons/*.json --model /data/OCRFlux-7B
24
+
25
+ python -m eval.eval_html_table_merge ./eval_html_table_merge_result --gt_file /data/OCRFlux-pubtabnet-cross/data.jsonl
26
+
27
+
eval/eval_element_merge_detect.py ADDED
@@ -0,0 +1,137 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import nltk
5
+ from tqdm import tqdm
6
+ from eval.parallel import parallel_process
7
+
8
+
9
+ def evaluate(pred, gt):
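+ # Exact-match scoring: returns 1 only if the sorted predicted merge index pairs equal the sorted ground-truth pairs, else 0.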
10
+ pred = sorted(pred, key=lambda x: (x[0], x[1]))
11
+ gt = sorted(gt, key=lambda x: (x[0], x[1]))
12
+ if pred == gt:
13
+ return 1
14
+ else:
15
+ return 0
16
+
17
+ def main():
18
+ parser = argparse.ArgumentParser(description="Evaluate element_merge_detect task")
19
+ parser.add_argument(
20
+ "workspace",
21
+ help="The filesystem path where work will be stored, can be a local folder",
22
+ )
23
+ parser.add_argument(
24
+ "--gt_file",
25
+ help="Ground truth file",
26
+ )
27
+ parser.add_argument("--n_jobs", type=int, default=40, help="Number of jobs to run in parallel")
28
+ args = parser.parse_args()
29
+
30
+ pred_data = {}
31
+ root_dir = os.path.join(args.workspace, "results")
32
+ for jsonl_file in os.listdir(root_dir):
33
+ if jsonl_file.endswith(".jsonl"):
34
+ with open(os.path.join(root_dir, jsonl_file), "r") as f:
35
+ for line in f:
36
+ data = json.loads(line)
37
+ pred_data[os.path.basename(data['orig_path'])] = data['merge_pairs']
38
+
39
+ filename_list_en = []
40
+ filename_list_zh = []
41
+ gt_data = {}
42
+ with open(args.gt_file, "r") as f:
43
+ for line in f:
44
+ data = json.loads(line)
45
+ pdf_name_1 = data['pdf_name_1'].split(".")[0]
46
+ pdf_name_2 = data['pdf_name_2'].split(".")[0]
47
+
48
+ pdf_name,page_1 = pdf_name_1.split('_')
49
+ pdf_name,page_2 = pdf_name_2.split('_')
50
+
51
+ json_name = pdf_name + '_' + page_1 + '_' + page_2 + '.json'
52
+ gt_data[json_name] = data['merging_idx_pairs']
53
+
54
+ if data['language'] == 'en':
55
+ filename_list_en.append(json_name)
56
+ else:
57
+ filename_list_zh.append(json_name)
58
+
59
+ keys = list(gt_data.keys())
60
+ if args.n_jobs == 1:
61
+ scores = [evaluate(pred_data.get(filename, []), gt_data.get(filename, [])) for filename in tqdm(keys)]
62
+ else:
63
+ inputs = [{'pred': pred_data.get(filename, []), 'gt': gt_data.get(filename, [])} for filename in keys]
64
+ scores = parallel_process(inputs, evaluate, use_kwargs=True, n_jobs=args.n_jobs, front_num=1)
65
+
66
+ tp_en = 0
67
+ tn_en = 0
68
+ fp_en = 0
69
+ fn_en = 0
70
+ tp_zh = 0
71
+ tn_zh = 0
72
+ fp_zh = 0
73
+ fn_zh = 0
74
+ score_en = 0
75
+ score_zh = 0
76
+ num_en = 0
77
+ num_zh = 0
78
+ for filename, score in zip(keys, scores):
79
+ print(filename)
80
+ print(score)
81
+ print()
82
+ pred_label = pred_data[filename]
83
+ if filename in filename_list_en:
84
+ if pred_label == []:
85
+ if score == 1:
86
+ tn_en += 1
87
+ else:
88
+ fn_en += 1
89
+ else:
90
+ if score == 1:
91
+ tp_en += 1
92
+ else:
93
+ fp_en += 1
94
+ score_en += score
95
+ num_en += 1
96
+
97
+ elif filename in filename_list_zh:
98
+ if pred_label == []:
99
+ if score == 1:
100
+ tn_zh += 1
101
+ else:
102
+ fn_zh += 1
103
+ else:
104
+ if score == 1:
105
+ tp_zh += 1
106
+ else:
107
+ fp_zh += 1
108
+ score_zh += score
109
+ num_zh += 1
110
+
111
+ precision_en = tp_en / (tp_en + fp_en)
112
+ recall_en = tp_en / (tp_en + fn_en)
113
+ f1_en = 2*precision_en*recall_en / (precision_en+recall_en)
114
+ acc_en = score_en / num_en
115
+
116
+ precision_zh = tp_zh / (tp_zh + fp_zh)
117
+ recall_zh = tp_zh / (tp_zh + fn_zh)
118
+ f1_zh = 2*precision_zh*recall_zh / (precision_zh+recall_zh)
119
+ acc_zh = score_zh / num_zh
120
+
121
+ tp = tp_en + tp_zh
122
+ fp = fp_en + fp_zh
123
+ fn = fn_en + fn_zh
124
+ score = score_en + score_zh
125
+ num = num_en + num_zh
126
+
127
+ precision = tp / (tp + fp)
128
+ recall = tp / (tp + fn)
129
+ f1 = 2*precision*recall / (precision+recall)
130
+ acc = score / num
131
+
132
+ print(f"EN: {precision_en} / {recall_en} / {f1_en} / {acc_en}")
133
+ print(f"ZH: {precision_zh} / {recall_zh} / {f1_zh} / {acc_zh}")
134
+ print(f"ALL: {precision} / {recall} / {f1} / {acc}")
135
+
136
+ if __name__ == "__main__":
137
+ main()
eval/eval_html_table_merge.py ADDED
@@ -0,0 +1,208 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import distance
5
+ from apted import APTED, Config
6
+ from apted.helpers import Tree
7
+ from lxml import etree, html
8
+ from collections import deque
9
+ from tqdm import tqdm
10
+ from eval.parallel import parallel_process
11
+
12
+
13
+ class TableTree(Tree):
14
+ def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
15
+ self.tag = tag
16
+ self.colspan = colspan
17
+ self.rowspan = rowspan
18
+ self.content = content
19
+ self.children = list(children)
20
+
21
+ def bracket(self):
22
+ """Show tree using brackets notation"""
23
+ if self.tag == 'td':
24
+ result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
25
+ (self.tag, self.colspan, self.rowspan, self.content)
26
+ else:
27
+ result = '"tag": %s' % self.tag
28
+ for child in self.children:
29
+ result += child.bracket()
30
+ return "{{{}}}".format(result)
31
+
32
+
33
+ class CustomConfig(Config):
34
+ @staticmethod
35
+ def maximum(*sequences):
36
+ """Get maximum possible value
37
+ """
38
+ return max(map(len, sequences))
39
+
40
+ def normalized_distance(self, *sequences):
41
+ """Get distance from 0 to 1
42
+ """
43
+ return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
44
+
45
+ def rename(self, node1, node2):
46
+ """Compares attributes of trees"""
47
+ if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
48
+ return 1.
49
+ if node1.tag == 'td':
50
+ if node1.content or node2.content:
51
+ return self.normalized_distance(node1.content, node2.content)
52
+ return 0.
53
+
54
+
55
+ class TEDS(object):
56
+ ''' Tree Edit Distance based Similarity
57
+ '''
58
+ def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
59
+ assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be a positive integer'
60
+ self.structure_only = structure_only
61
+ self.n_jobs = n_jobs
62
+ self.ignore_nodes = ignore_nodes
63
+ self.__tokens__ = []
64
+
65
+ def tokenize(self, node):
66
+ ''' Tokenizes table cells
67
+ '''
68
+ self.__tokens__.append('<%s>' % node.tag)
69
+ if node.text is not None:
70
+ self.__tokens__ += list(node.text)
71
+ for n in node.getchildren():
72
+ self.tokenize(n)
73
+ if node.tag != 'unk':
74
+ self.__tokens__.append('</%s>' % node.tag)
75
+ if node.tag != 'td' and node.tail is not None:
76
+ self.__tokens__ += list(node.tail)
77
+
78
+ def load_html_tree(self, node, parent=None):
79
+ ''' Converts HTML tree to the format required by apted
80
+ '''
81
+ global __tokens__
82
+ if node.tag == 'td':
83
+ if self.structure_only:
84
+ cell = []
85
+ else:
86
+ self.__tokens__ = []
87
+ self.tokenize(node)
88
+ cell = self.__tokens__[1:-1].copy()
89
+ new_node = TableTree(node.tag,
90
+ int(node.attrib.get('colspan', '1')),
91
+ int(node.attrib.get('rowspan', '1')),
92
+ cell, *deque())
93
+ else:
94
+ new_node = TableTree(node.tag, None, None, None, *deque())
95
+ if parent is not None:
96
+ parent.children.append(new_node)
97
+ if node.tag != 'td':
98
+ for n in node.getchildren():
99
+ self.load_html_tree(n, new_node)
100
+ if parent is None:
101
+ return new_node
102
+
103
+ def evaluate(self, pred, true):
104
+ ''' Computes TEDS score between the prediction and the ground truth of a
105
+ given sample
106
+ '''
107
+ if (not pred) or (not true):
108
+ return 0.0
109
+ pred = "<html>" + pred + "</html>"
110
+ true = "<html>" + true + "</html>"
111
+ parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
112
+ pred = html.fromstring(pred, parser=parser)
113
+ true = html.fromstring(true, parser=parser)
114
+ if pred.xpath('body/table') and true.xpath('body/table'):
115
+ pred = pred.xpath('body/table')[0]
116
+ true = true.xpath('body/table')[0]
117
+ if self.ignore_nodes:
118
+ etree.strip_tags(pred, *self.ignore_nodes)
119
+ etree.strip_tags(true, *self.ignore_nodes)
120
+ n_nodes_pred = len(pred.xpath(".//*"))
121
+ n_nodes_true = len(true.xpath(".//*"))
122
+ n_nodes = max(n_nodes_pred, n_nodes_true)
123
+ tree_pred = self.load_html_tree(pred)
124
+ tree_true = self.load_html_tree(true)
125
+ distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
126
+ return 1.0 - (float(distance) / n_nodes)
127
+ else:
128
+ return 0.0
129
+
130
+ def batch_evaluate(self, pred_json, true_json):
131
+ ''' Computes TEDS score between the prediction and the ground truth of
132
+ a batch of samples
133
+ @params pred_json: {'FILENAME': 'HTML CODE', ...}
134
+ @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
135
+ @output: {'FILENAME': 'TEDS SCORE', ...}
136
+ '''
137
+ samples = true_json.keys()
138
+ if self.n_jobs == 1:
139
+ scores = [self.evaluate(pred_json.get(filename, ''), true_json[filename]['html']) for filename in tqdm(samples)]
140
+ else:
141
+ inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]['html']} for filename in samples]
142
+ scores = parallel_process(inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
143
+ total_score_simple = 0
144
+ num_simple = 0
145
+ total_score_complex = 0
146
+ num_complex = 0
147
+ total_score = 0
148
+ num_total = 0
149
+ for filename,score in zip(samples, scores):
150
+ print(filename)
151
+ print(score)
152
+ print('')
153
+ if true_json[filename]['type'] == 'simple':
154
+ total_score_simple += score
155
+ num_simple += 1
156
+ elif true_json[filename]['type'] == 'complex':
157
+ total_score_complex += score
158
+ num_complex += 1
159
+ else:
160
+ raise ValueError('Unknown type: %s' % true_json[filename]['type'])
161
+ total_score += score
162
+ num_total += 1
163
+ if num_simple > 0:
164
+ avg_score_simple = total_score_simple / num_simple
165
+ else:
166
+ avg_score_simple = 0
167
+ if num_complex > 0:
168
+ avg_score_complex = total_score_complex / num_complex
169
+ else:
170
+ avg_score_complex = 0
171
+ avg_score = total_score / num_total
172
+ print({'simple': (num_simple,avg_score_simple), 'complex': (num_complex,avg_score_complex), 'total': (num_total,avg_score)})
173
+
174
+ def main():
175
+ parser = argparse.ArgumentParser(description="Evaluate page_to_markdown task")
176
+ parser.add_argument(
177
+ "workspace",
178
+ help="The filesystem path where work will be stored, can be a local folder",
179
+ )
180
+ parser.add_argument(
181
+ "--gt_file",
182
+ help="Ground truth file",
183
+ )
184
+ parser.add_argument("--n_jobs", type=int, default=40, help="Number of jobs to run in parallel")
185
+ args = parser.parse_args()
186
+
187
+ pred_data = {}
188
+ root_dir = os.path.join(args.workspace, "results")
189
+ for jsonl_file in os.listdir(root_dir):
190
+ if jsonl_file.endswith(".jsonl"):
191
+ with open(os.path.join(root_dir, jsonl_file), "r") as f:
192
+ for line in f:
193
+ data = json.loads(line)
194
+ key = os.path.basename(data['orig_path']).split('.')[0]
195
+ pred_data[key] = data['merged_tables']
196
+
197
+ gt_data = {}
198
+ with open(args.gt_file, "r") as f:
199
+ for line in f:
200
+ data = json.loads(line)
201
+ key = data['image_name'].split('.')[0]
202
+ gt_data[key] = {'html':data['gt_table'], 'type':data['type']}
203
+
204
+ teds = TEDS(n_jobs=args.n_jobs, ignore_nodes=['b', 'thead', 'tbody'])
205
+ teds.batch_evaluate(pred_data, gt_data)
206
+
207
+ if __name__ == "__main__":
208
+ main()
eval/eval_page_to_markdown.py ADDED
@@ -0,0 +1,76 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import nltk
5
+ from tqdm import tqdm
6
+ from eval.parallel import parallel_process
7
+
8
+
9
+ def evaluate(pred, gt):
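+ # Edit Distance Similarity (EDS): 1 minus the normalized character-level edit distance between prediction and ground truth.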
10
+ edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))
11
+ return 1.0 - edit_dist
12
+
13
+
14
+ def main():
15
+ parser = argparse.ArgumentParser(description="Evaluate page_to_markdown task")
16
+ parser.add_argument(
17
+ "workspace",
18
+ help="The filesystem path where work will be stored, can be a local folder",
19
+ )
20
+ parser.add_argument(
21
+ "--gt_file",
22
+ help="Ground truth file",
23
+ )
24
+ parser.add_argument("--n_jobs", type=int, default=40, help="Number of jobs to run in parallel")
25
+ args = parser.parse_args()
26
+
27
+ pred_data = {}
28
+ root_dir = os.path.join(args.workspace, "results")
29
+ for jsonl_file in os.listdir(root_dir):
30
+ if jsonl_file.endswith(".jsonl"):
31
+ with open(os.path.join(root_dir, jsonl_file), "r") as f:
32
+ for line in f:
33
+ data = json.loads(line)
34
+ pred_data[os.path.basename(data['orig_path'])] = data['document_text']
35
+
36
+ filename_list_en = []
37
+ filename_list_zh = []
38
+ gt_data = {}
39
+ with open(args.gt_file, "r") as f:
40
+ for line in f:
41
+ data = json.loads(line)
42
+ markdown = data['markdown']
43
+ pdf_name = data['pdf_name']
44
+ gt_data[pdf_name] = markdown
45
+ if data['language'] == 'en':
46
+ filename_list_en.append(pdf_name)
47
+ else:
48
+ filename_list_zh.append(pdf_name)
49
+
50
+ keys = list(gt_data.keys())
51
+ if args.n_jobs == 1:
52
+ scores = [evaluate(pred_data.get(filename, ''), gt_data.get(filename, '')) for filename in tqdm(keys)]
53
+ else:
54
+ inputs = [{'pred': pred_data.get(filename, ''), 'gt': gt_data.get(filename, '')} for filename in keys]
55
+ scores = parallel_process(inputs, evaluate, use_kwargs=True, n_jobs=args.n_jobs, front_num=1)
56
+
57
+ total_score_en = 0
58
+ total_num_en = 0
59
+ total_score_zh = 0
60
+ total_num_zh = 0
61
+ for filename, score in zip(keys, scores):
62
+ print(filename)
63
+ print(score)
64
+ print()
65
+ if filename in filename_list_en:
66
+ total_score_en += score
67
+ total_num_en += 1
68
+ elif filename in filename_list_zh:
69
+ total_score_zh += score
70
+ total_num_zh += 1
71
+ print(f"English: {total_score_en / total_num_en}")
72
+ print(f"Chinese: {total_score_zh / total_num_zh}")
73
+ print(f"Total: {sum(scores) / len(scores)}")
74
+
75
+ if __name__ == "__main__":
76
+ main()
eval/eval_page_to_markdown_nanonets.py ADDED
@@ -0,0 +1,160 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import argparse
5
+ import nltk
6
+ import markdown2
7
+ from bs4 import BeautifulSoup
8
+ from tqdm import tqdm
9
+ from eval.parallel import parallel_process
10
+
11
+ def turn_header_to_h1(line):
12
+ # Check whether the line is a heading that starts with one or more '#'
13
+ if line.lstrip().startswith('#'):
14
+ # Strip the leading '#' characters and the following spaces, then prepend a single '# '
15
+ new_line = "# " + line.lstrip().lstrip('#').lstrip()
16
+ return new_line
17
+ else:
18
+ return line
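+ # e.g. "### Results" becomes "# Results": predicted heading levels are collapsed to h1
+ # before the edit-distance comparison.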
19
+
20
+ def replace_single_dollar(markdown_text):
21
+ pattern = r'\$(.*?)\$'
22
+ def replace_with_brackets(match):
23
+ formula_content = match.group(1) # the formula content captured by the match
24
+ return f'\\({formula_content}\\)'
25
+
26
+ replaced_text = re.sub(pattern, replace_with_brackets, markdown_text, flags=re.DOTALL)
27
+
28
+ return replaced_text
29
+
30
+
31
+ def replace_double_dollar(markdown_text):
32
+ pattern = r'\$\$(.*?)\$\$'
33
+ def replace_with_brackets(match):
34
+ formula_content = match.group(1)
35
+ return f'\\[{formula_content}\\]'
36
+ replaced_text = re.sub(pattern, replace_with_brackets, markdown_text, flags=re.DOTALL)
37
+
38
+ return replaced_text
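+ # Together these normalize formula delimiters, e.g. "$$E=mc^2$$" -> "\[E=mc^2\]" and
+ # "$x$" -> "\(x\)"; double-dollar blocks are replaced first so the single-dollar pass
+ # does not split them.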
39
+
40
+ def simplify_html_table(html_table):
41
+ # Parse the HTML with BeautifulSoup
42
+ soup = BeautifulSoup(html_table, 'html.parser')
43
+
44
+ # Locate the <table> tag
45
+ table = soup.find('table')
46
+ if not table:
47
+ raise ValueError("输入的 HTML 不包含有效的 <table> 标签")
48
+
49
+ # Create a new, empty <table> tag
50
+ new_table = BeautifulSoup('<table></table>', 'html.parser').table
51
+
52
+ # Extract all rows (including those inside <thead> and <tbody>)
53
+ rows = table.find_all(['tr'], recursive=True)
54
+
55
+ for row in rows:
56
+ # Create a new <tr> tag
57
+ new_row = soup.new_tag('tr')
58
+
59
+ # Process the cells in this row
60
+ cells = row.find_all(['th', 'td'])
61
+ for cell in cells:
62
+ # Replace <th> with <td>
63
+ new_cell = soup.new_tag('td')
64
+ if cell.has_attr('rowspan'):
65
+ new_cell['rowspan'] = cell['rowspan']
66
+ if cell.has_attr('colspan'):
67
+ new_cell['colspan'] = cell['colspan']
68
+ new_cell.string = cell.get_text(strip=True) # keep the cell text
69
+ new_row.append(new_cell)
70
+
71
+ # Append the new row to the new table
72
+ new_table.append(new_row)
73
+
74
+ # Return the simplified table HTML
75
+ return str(new_table)
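+ # Example: '<table><thead><tr><th colspan="2">A</th></tr></thead></table>' is reduced to
+ # '<table><tr><td colspan="2">A</td></tr></table>', keeping only rows, cells, spans and text.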
76
+
77
+ def evaluate(pred, gt):
78
+ edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))
79
+ return 1.0 - edit_dist
80
+
81
+
82
+ def main():
83
+ parser = argparse.ArgumentParser(description="Evaluate page_to_markdown task")
84
+ parser.add_argument(
85
+ "workspace",
86
+ help="The filesystem path where work will be stored, can be a local folder",
87
+ )
88
+ parser.add_argument(
89
+ "--gt_file",
90
+ help="Ground truth file",
91
+ )
92
+ parser.add_argument("--n_jobs", type=int, default=40, help="Number of jobs to run in parallel")
93
+ args = parser.parse_args()
94
+
95
+ pred_data = {}
96
+ for file in os.listdir(args.workspace):
97
+ file_path = os.path.join(args.workspace, file)
98
+ pdf_name = file.split('.')[0] + ".pdf"
99
+ with open(file_path, "r") as f:
100
+ document_text = f.read()
101
+ document_text = replace_single_dollar(replace_double_dollar(document_text))
102
+ markdown_text_list = document_text.split("\n\n")
103
+ new_markdown_text_list = []
104
+ for text in markdown_text_list:
105
+ text = text.strip()
106
+ if (text.startswith("<watermark>") and text.endswith("</watermark>")) or (text.startswith("<img>") and text.endswith("</img>")) or (text.startswith("<page_number>") and text.endswith("</page_number>")) or (text.startswith("<signature>") and text.endswith("</signature>")):
107
+ continue
108
+ else:
109
+ html_text = str(markdown2.markdown(text,extras=["tables"]))
110
+ html_text = html_text.strip()
111
+ if html_text.startswith("<table>") and html_text.endswith("</table>"):
112
+ html_table = simplify_html_table(html_text)
113
+ new_markdown_text_list.append(html_table)
114
+ else:
115
+ text = turn_header_to_h1(text)
116
+ new_markdown_text_list.append(text)
117
+
118
+ pred_data[os.path.basename(pdf_name)] = "\n\n".join(new_markdown_text_list)
119
+
120
+ filename_list_en = []
121
+ filename_list_zh = []
122
+ gt_data = {}
123
+ with open(args.gt_file, "r") as f:
124
+ for line in f:
125
+ data = json.loads(line)
126
+ markdown = data['markdown']
127
+ pdf_name = data['pdf_name']
128
+ gt_data[pdf_name] = markdown
129
+ if data['language'] == 'en':
130
+ filename_list_en.append(pdf_name)
131
+ else:
132
+ filename_list_zh.append(pdf_name)
133
+
134
+ keys = list(gt_data.keys())
135
+ if args.n_jobs == 1:
136
+ scores = [evaluate(pred_data.get(filename, ''), gt_data.get(filename, '')) for filename in tqdm(keys)]
137
+ else:
138
+ inputs = [{'pred': pred_data.get(filename, ''), 'gt': gt_data.get(filename, '')} for filename in keys]
139
+ scores = parallel_process(inputs, evaluate, use_kwargs=True, n_jobs=args.n_jobs, front_num=1)
140
+
141
+ total_score_en = 0
142
+ total_num_en = 0
143
+ total_score_zh = 0
144
+ total_num_zh = 0
145
+ for filename, score in zip(keys, scores):
146
+ if filename in filename_list_en:
147
+ print(filename)
148
+ print(score)
149
+ print()
150
+ total_score_en += score
151
+ total_num_en += 1
152
+ elif filename in filename_list_zh:
153
+ total_score_zh += score
154
+ total_num_zh += 1
155
+ print(f"English: {total_score_en / total_num_en}")
156
+ print(f"Chinese: {total_score_zh / total_num_zh}")
157
+ print(f"Total: {sum(scores) / len(scores)}")
158
+
159
+ if __name__ == "__main__":
160
+ main()
eval/eval_page_to_markdown_olmocr.py ADDED
@@ -0,0 +1,157 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import argparse
5
+ import nltk
6
+ import markdown2
7
+ from bs4 import BeautifulSoup
8
+ from tqdm import tqdm
9
+ from eval.parallel import parallel_process
10
+
11
+ def turn_header_to_h1(line):
12
+ # Check whether the line is a heading that starts with one or more '#'
13
+ if line.lstrip().startswith('#'):
14
+ # Strip the leading '#' characters and the following spaces, then prepend a single '# '
15
+ new_line = "# " + line.lstrip().lstrip('#').lstrip()
16
+ return new_line
17
+ else:
18
+ return line
19
+
20
+ def replace_single_dollar(markdown_text):
21
+ pattern = r'\$(.*?)\$'
22
+ def replace_with_brackets(match):
23
+ formula_content = match.group(1) # the formula content captured by the match
24
+ return f'\\({formula_content}\\)'
25
+
26
+ replaced_text = re.sub(pattern, replace_with_brackets, markdown_text, flags=re.DOTALL)
27
+
28
+ return replaced_text
29
+
30
+
31
+ def replace_double_dollar(markdown_text):
32
+ pattern = r'\$\$(.*?)\$\$'
33
+ def replace_with_brackets(match):
34
+ formula_content = match.group(1)
35
+ return f'\\[{formula_content}\\]'
36
+ replaced_text = re.sub(pattern, replace_with_brackets, markdown_text, flags=re.DOTALL)
37
+
38
+ return replaced_text
39
+
40
+ def simplify_html_table(html_table):
41
+ # Parse the HTML with BeautifulSoup
42
+ soup = BeautifulSoup(html_table, 'html.parser')
43
+
44
+ # Locate the <table> tag
45
+ table = soup.find('table')
46
+ if not table:
47
+ raise ValueError("输入的 HTML 不包含有效的 <table> 标签")
48
+
49
+ # Create a new, empty <table> tag
50
+ new_table = BeautifulSoup('<table></table>', 'html.parser').table
51
+
52
+ # Extract all rows (including those inside <thead> and <tbody>)
53
+ rows = table.find_all(['tr'], recursive=True)
54
+
55
+ for row in rows:
56
+ # Create a new <tr> tag
57
+ new_row = soup.new_tag('tr')
58
+
59
+ # Process the cells in this row
60
+ cells = row.find_all(['th', 'td'])
61
+ for cell in cells:
62
+ # Replace <th> with <td>
63
+ new_cell = soup.new_tag('td')
64
+ new_cell.string = cell.get_text(strip=True) # keep the cell text
65
+ new_row.append(new_cell)
66
+
67
+ # Append the new row to the new table
68
+ new_table.append(new_row)
69
+
70
+ # Return the simplified table HTML
71
+ return str(new_table)
72
+
73
+ def evaluate(pred, gt):
74
+ edit_dist = nltk.edit_distance(pred, gt) / max(len(pred), len(gt))
75
+ return 1.0 - edit_dist
76
+
77
+
78
+ def main():
79
+ parser = argparse.ArgumentParser(description="Evaluate page_to_markdown task")
80
+ parser.add_argument(
81
+ "workspace",
82
+ help="The filesystem path where work will be stored, can be a local folder",
83
+ )
84
+ parser.add_argument(
85
+ "--gt_file",
86
+ help="Ground truth file",
87
+ )
88
+ parser.add_argument("--n_jobs", type=int, default=40, help="Number of jobs to run in parallel")
89
+ args = parser.parse_args()
90
+
91
+ pred_data = {}
92
+ root_dir = os.path.join(args.workspace, "results")
93
+ for jsonl_file in os.listdir(root_dir):
94
+ if jsonl_file.endswith(".jsonl"):
95
+ with open(os.path.join(root_dir, jsonl_file), "r") as f:
96
+ for line in f:
97
+ data = json.loads(line)
98
+ pdf_path = data['metadata']['Source-File']
99
+ document_text = data['text']
100
+ document_text = replace_single_dollar(replace_double_dollar(document_text))
101
+
102
+ markdown_text_list = document_text.split("\n\n")
103
+
104
+ new_markdown_text_list = []
105
+ for text in markdown_text_list:
106
+ html_text = str(markdown2.markdown(text,extras=["tables"]))
107
+ html_text = html_text.strip()
108
+ if html_text.startswith("<table>") and html_text.endswith("</table>"):
109
+ html_table = simplify_html_table(html_text)
110
+ new_markdown_text_list.append(html_table)
111
+ else:
112
+ text = turn_header_to_h1(text)
113
+ new_markdown_text_list.append(text)
114
+
115
+ pred_data[os.path.basename(pdf_path)] = "\n\n".join(new_markdown_text_list)
116
+
117
+ filename_list_en = []
118
+ filename_list_zh = []
119
+ gt_data = {}
120
+ with open(args.gt_file, "r") as f:
121
+ for line in f:
122
+ data = json.loads(line)
123
+ markdown = data['markdown']
124
+ pdf_name = data['pdf_name']
125
+ gt_data[pdf_name] = markdown
126
+ if data['language'] == 'en':
127
+ filename_list_en.append(pdf_name)
128
+ else:
129
+ filename_list_zh.append(pdf_name)
130
+
131
+ keys = list(gt_data.keys())
132
+ if args.n_jobs == 1:
133
+ scores = [evaluate(pred_data.get(filename, ''), gt_data.get(filename, '')) for filename in tqdm(keys)]
134
+ else:
135
+ inputs = [{'pred': pred_data.get(filename, ''), 'gt': gt_data.get(filename, '')} for filename in keys]
136
+ scores = parallel_process(inputs, evaluate, use_kwargs=True, n_jobs=args.n_jobs, front_num=1)
137
+
138
+ total_score_en = 0
139
+ total_num_en = 0
140
+ total_score_zh = 0
141
+ total_num_zh = 0
142
+ for filename, score in zip(keys, scores):
143
+ if filename in filename_list_en:
144
+ print(filename)
145
+ print(score)
146
+ print()
147
+ total_score_en += score
148
+ total_num_en += 1
149
+ elif filename in filename_list_zh:
150
+ total_score_zh += score
151
+ total_num_zh += 1
152
+ print(f"English: {total_score_en / total_num_en}")
153
+ print(f"Chinese: {total_score_zh / total_num_zh}")
154
+ print(f"Total: {sum(scores) / len(scores)}")
155
+
156
+ if __name__ == "__main__":
157
+ main()
eval/eval_table_to_html.py ADDED
@@ -0,0 +1,206 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import distance
5
+ from apted import APTED, Config
6
+ from apted.helpers import Tree
7
+ from lxml import etree, html
8
+ from collections import deque
9
+ from tqdm import tqdm
10
+ from eval.parallel import parallel_process
11
+
12
+
13
+ class TableTree(Tree):
14
+ def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
15
+ self.tag = tag
16
+ self.colspan = colspan
17
+ self.rowspan = rowspan
18
+ self.content = content
19
+ self.children = list(children)
20
+
21
+ def bracket(self):
22
+ """Show tree using brackets notation"""
23
+ if self.tag == 'td':
24
+ result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
25
+ (self.tag, self.colspan, self.rowspan, self.content)
26
+ else:
27
+ result = '"tag": %s' % self.tag
28
+ for child in self.children:
29
+ result += child.bracket()
30
+ return "{{{}}}".format(result)
31
+
32
+
33
+ class CustomConfig(Config):
34
+ @staticmethod
35
+ def maximum(*sequences):
36
+ """Get maximum possible value
37
+ """
38
+ return max(map(len, sequences))
39
+
40
+ def normalized_distance(self, *sequences):
41
+ """Get distance from 0 to 1
42
+ """
43
+ return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
44
+
45
+ def rename(self, node1, node2):
46
+ """Compares attributes of trees"""
47
+ if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
48
+ return 1.
49
+ if node1.tag == 'td':
50
+ if node1.content or node2.content:
51
+ return self.normalized_distance(node1.content, node2.content)
52
+ return 0.
53
+
54
+
55
+ class TEDS(object):
56
+ ''' Tree Edit Distance based Similarity
57
+ '''
58
+ def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
59
+ assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greater than or equal to 1'
60
+ self.structure_only = structure_only
61
+ self.n_jobs = n_jobs
62
+ self.ignore_nodes = ignore_nodes
63
+ self.__tokens__ = []
64
+
65
+ def tokenize(self, node):
66
+ ''' Tokenizes table cells
67
+ '''
68
+ self.__tokens__.append('<%s>' % node.tag)
69
+ if node.text is not None:
70
+ self.__tokens__ += list(node.text)
71
+ for n in node.getchildren():
72
+ self.tokenize(n)
73
+ if node.tag != 'unk':
74
+ self.__tokens__.append('</%s>' % node.tag)
75
+ if node.tag != 'td' and node.tail is not None:
76
+ self.__tokens__ += list(node.tail)
77
+
78
+ def load_html_tree(self, node, parent=None):
79
+ ''' Converts HTML tree to the format required by apted
80
+ '''
81
+ global __tokens__
82
+ if node.tag == 'td':
83
+ if self.structure_only:
84
+ cell = []
85
+ else:
86
+ self.__tokens__ = []
87
+ self.tokenize(node)
88
+ cell = self.__tokens__[1:-1].copy()
89
+ new_node = TableTree(node.tag,
90
+ int(node.attrib.get('colspan', '1')),
91
+ int(node.attrib.get('rowspan', '1')),
92
+ cell, *deque())
93
+ else:
94
+ new_node = TableTree(node.tag, None, None, None, *deque())
95
+ if parent is not None:
96
+ parent.children.append(new_node)
97
+ if node.tag != 'td':
98
+ for n in node.getchildren():
99
+ self.load_html_tree(n, new_node)
100
+ if parent is None:
101
+ return new_node
102
+
103
+ def evaluate(self, pred, true):
104
+ ''' Computes TEDS score between the prediction and the ground truth of a
105
+ given sample
106
+ '''
107
+ if (not pred) or (not true):
108
+ return 0.0
109
+ pred = "<html>" + pred + "</html>"
110
+ true = "<html>" + true + "</html>"
111
+ parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
112
+ pred = html.fromstring(pred, parser=parser)
113
+ true = html.fromstring(true, parser=parser)
114
+ if pred.xpath('body/table') and true.xpath('body/table'):
115
+ pred = pred.xpath('body/table')[0]
116
+ true = true.xpath('body/table')[0]
117
+ if self.ignore_nodes:
118
+ etree.strip_tags(pred, *self.ignore_nodes)
119
+ etree.strip_tags(true, *self.ignore_nodes)
120
+ n_nodes_pred = len(pred.xpath(".//*"))
121
+ n_nodes_true = len(true.xpath(".//*"))
122
+ n_nodes = max(n_nodes_pred, n_nodes_true)
123
+ tree_pred = self.load_html_tree(pred)
124
+ tree_true = self.load_html_tree(true)
125
+ distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
126
+ return 1.0 - (float(distance) / n_nodes)
127
+ else:
128
+ return 0.0
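+ # The returned score is 1 - TED(pred, true) / max(#nodes(pred), #nodes(true)), where the
+ # tree edit distance is computed by APTED with the cell-aware costs defined in CustomConfig.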
129
+
130
+ def batch_evaluate(self, pred_json, true_json):
131
+ ''' Computes TEDS score between the prediction and the ground truth of
132
+ a batch of samples
133
+ @params pred_json: {'FILENAME': 'HTML CODE', ...}
134
+ @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
135
+ @output: {'FILENAME': 'TEDS SCORE', ...}
136
+ '''
137
+ samples = true_json.keys()
138
+ if self.n_jobs == 1:
139
+ scores = [self.evaluate(pred_json.get(filename, ''), true_json[filename]['html']) for filename in tqdm(samples)]
140
+ else:
141
+ inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]['html']} for filename in samples]
142
+ scores = parallel_process(inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
143
+ total_score_simple = 0
144
+ num_simple = 0
145
+ total_score_complex = 0
146
+ num_complex = 0
147
+ total_score = 0
148
+ num_total = 0
149
+ for filename,score in zip(samples, scores):
150
+ print(filename)
151
+ print(score)
152
+ print('')
153
+ if true_json[filename]['type'] == 'simple':
154
+ total_score_simple += score
155
+ num_simple += 1
156
+ elif true_json[filename]['type'] == 'complex':
157
+ total_score_complex += score
158
+ num_complex += 1
159
+ else:
160
+ raise ValueError('Unknown type: %s' % true_json[filename]['type'])
161
+ total_score += score
162
+ num_total += 1
163
+ if num_simple > 0:
164
+ avg_score_simple = total_score_simple / num_simple
165
+ else:
166
+ avg_score_simple = 0
167
+ if num_complex > 0:
168
+ avg_score_complex = total_score_complex / num_complex
169
+ else:
170
+ avg_score_complex = 0
171
+ avg_score = total_score / num_total
172
+ print({'simple': (num_simple,avg_score_simple), 'complex': (num_complex,avg_score_complex), 'total': (num_total,avg_score)})
173
+
174
+ def main():
175
+ parser = argparse.ArgumentParser(description="Evaluate table_to_html task")
176
+ parser.add_argument(
177
+ "workspace",
178
+ help="The filesystem path where work will be stored, can be a local folder",
179
+ )
180
+ parser.add_argument(
181
+ "--gt_file",
182
+ help="Ground truth file",
183
+ )
184
+ parser.add_argument("--n_jobs", type=int, default=40, help="Number of jobs to run in parallel")
185
+ args = parser.parse_args()
186
+
187
+ pred_data = {}
188
+ root_dir = os.path.join(args.workspace, "results")
189
+ for jsonl_file in os.listdir(root_dir):
190
+ if jsonl_file.endswith(".jsonl"):
191
+ with open(os.path.join(root_dir, jsonl_file), "r") as f:
192
+ for line in f:
193
+ data = json.loads(line)
194
+ pred_data[os.path.basename(data['orig_path'])] = data['document_text']
195
+
196
+ gt_data = {}
197
+ with open(args.gt_file, "r") as f:
198
+ for line in f:
199
+ data = json.loads(line)
200
+ gt_data[data['image_name']] = {'html':data['gt_table'], 'type':data['type']}
201
+
202
+ teds = TEDS(n_jobs=args.n_jobs, ignore_nodes=['b', 'thead', 'tbody'])
203
+ teds.batch_evaluate(pred_data, gt_data)
204
+
205
+ if __name__ == "__main__":
206
+ main()
eval/eval_table_to_html_nanonets.py ADDED
@@ -0,0 +1,295 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import distance
5
+ import markdown2
6
+ import re
7
+ from apted import APTED, Config
8
+ from apted.helpers import Tree
9
+ from lxml import etree, html
10
+ from collections import deque
11
+ from tqdm import tqdm
12
+ from eval.parallel import parallel_process
13
+ from bs4 import BeautifulSoup
14
+
15
+ def turn_header_to_h1(line):
16
+ # 检查是否是以一个或多个 '#' 开头的标题行
17
+ if line.lstrip().startswith('#'):
18
+ # 去掉开头的 '#' 和其后的空格
19
+ new_line = "# " + line.lstrip().lstrip('#').lstrip()
20
+ return new_line
21
+ else:
22
+ return line
23
+
24
+ def replace_single_dollar(markdown_text):
25
+ pattern = r'\$(.*?)\$'
26
+ def replace_with_brackets(match):
27
+ formula_content = match.group(1) # 获取匹配到的公式内容
28
+ return f'\\({formula_content}\\)'
29
+
30
+ replaced_text = re.sub(pattern, replace_with_brackets, markdown_text, flags=re.DOTALL)
31
+
32
+ return replaced_text
33
+
34
+
35
+ def replace_double_dollar(markdown_text):
36
+ pattern = r'\$\$(.*?)\$\$'
37
+ def replace_with_brackets(match):
38
+ formula_content = match.group(1)
39
+ return f'\\[{formula_content}\\]'
40
+ replaced_text = re.sub(pattern, replace_with_brackets, markdown_text, flags=re.DOTALL)
41
+
42
+ return replaced_text
43
+
44
+
45
+ class TableTree(Tree):
46
+ def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
47
+ self.tag = tag
48
+ self.colspan = colspan
49
+ self.rowspan = rowspan
50
+ self.content = content
51
+ self.children = list(children)
52
+
53
+ def bracket(self):
54
+ """Show tree using brackets notation"""
55
+ if self.tag == 'td':
56
+ result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
57
+ (self.tag, self.colspan, self.rowspan, self.content)
58
+ else:
59
+ result = '"tag": %s' % self.tag
60
+ for child in self.children:
61
+ result += child.bracket()
62
+ return "{{{}}}".format(result)
63
+
64
+
65
+ class CustomConfig(Config):
66
+ @staticmethod
67
+ def maximum(*sequences):
68
+ """Get maximum possible value
69
+ """
70
+ return max(map(len, sequences))
71
+
72
+ def normalized_distance(self, *sequences):
73
+ """Get distance from 0 to 1
74
+ """
75
+ return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
76
+
77
+ def rename(self, node1, node2):
78
+ """Compares attributes of trees"""
79
+ if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
80
+ return 1.
81
+ if node1.tag == 'td':
82
+ if node1.content or node2.content:
83
+ return self.normalized_distance(node1.content, node2.content)
84
+ return 0.
85
+
86
+
87
+ class TEDS(object):
88
+ ''' Tree Edit Distance based Similarity
89
+ '''
90
+ def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
91
+ assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greater than or equal to 1'
92
+ self.structure_only = structure_only
93
+ self.n_jobs = n_jobs
94
+ self.ignore_nodes = ignore_nodes
95
+ self.__tokens__ = []
96
+
97
+ def tokenize(self, node):
98
+ ''' Tokenizes table cells
99
+ '''
100
+ self.__tokens__.append('<%s>' % node.tag)
101
+ if node.text is not None:
102
+ self.__tokens__ += list(node.text)
103
+ for n in node.getchildren():
104
+ self.tokenize(n)
105
+ if node.tag != 'unk':
106
+ self.__tokens__.append('</%s>' % node.tag)
107
+ if node.tag != 'td' and node.tail is not None:
108
+ self.__tokens__ += list(node.tail)
109
+
110
+ def load_html_tree(self, node, parent=None):
111
+ ''' Converts HTML tree to the format required by apted
112
+ '''
113
+ global __tokens__
114
+ if node.tag == 'td':
115
+ if self.structure_only:
116
+ cell = []
117
+ else:
118
+ self.__tokens__ = []
119
+ self.tokenize(node)
120
+ cell = self.__tokens__[1:-1].copy()
121
+ new_node = TableTree(node.tag,
122
+ int(node.attrib.get('colspan', '1')),
123
+ int(node.attrib.get('rowspan', '1')),
124
+ cell, *deque())
125
+ else:
126
+ new_node = TableTree(node.tag, None, None, None, *deque())
127
+ if parent is not None:
128
+ parent.children.append(new_node)
129
+ if node.tag != 'td':
130
+ for n in node.getchildren():
131
+ self.load_html_tree(n, new_node)
132
+ if parent is None:
133
+ return new_node
134
+
135
+ def evaluate(self, pred, true):
136
+ ''' Computes TEDS score between the prediction and the ground truth of a
137
+ given sample
138
+ '''
139
+ if (not pred) or (not true):
140
+ return 0.0
141
+ pred.replace("<th>","<td>")
142
+ pred.replace("</th>","</td>")
143
+ pred = "<html>" + pred + "</html>"
144
+ true = "<html>" + true + "</html>"
145
+ parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
146
+ pred = html.fromstring(pred, parser=parser)
147
+ true = html.fromstring(true, parser=parser)
148
+ if pred.xpath('body/table') and true.xpath('body/table'):
149
+ pred = pred.xpath('body/table')[0]
150
+ true = true.xpath('body/table')[0]
151
+ if self.ignore_nodes:
152
+ etree.strip_tags(pred, *self.ignore_nodes)
153
+ etree.strip_tags(true, *self.ignore_nodes)
154
+ n_nodes_pred = len(pred.xpath(".//*"))
155
+ n_nodes_true = len(true.xpath(".//*"))
156
+ n_nodes = max(n_nodes_pred, n_nodes_true)
157
+ tree_pred = self.load_html_tree(pred)
158
+ tree_true = self.load_html_tree(true)
159
+ distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
160
+ return 1.0 - (float(distance) / n_nodes)
161
+ else:
162
+ return 0.0
163
+
164
+ def batch_evaluate(self, pred_json, true_json):
165
+ ''' Computes TEDS score between the prediction and the ground truth of
166
+ a batch of samples
167
+ @params pred_json: {'FILENAME': 'HTML CODE', ...}
168
+ @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
169
+ @output: {'FILENAME': 'TEDS SCORE', ...}
170
+ '''
171
+ samples = true_json.keys()
172
+ if self.n_jobs == 1:
173
+ scores = [self.evaluate(pred_json.get(filename, ''), true_json[filename]['html']) for filename in tqdm(samples)]
174
+ else:
175
+ inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]['html']} for filename in samples]
176
+ scores = parallel_process(inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
177
+ total_score_simple = 0
178
+ num_simple = 0
179
+ total_score_complex = 0
180
+ num_complex = 0
181
+ total_score = 0
182
+ num_total = 0
183
+ for filename,score in zip(samples, scores):
184
+ print(filename)
185
+ print(score)
186
+ print('')
187
+ if true_json[filename]['type'] == 'simple':
188
+ total_score_simple += score
189
+ num_simple += 1
190
+ elif true_json[filename]['type'] == 'complex':
191
+ total_score_complex += score
192
+ num_complex += 1
193
+ else:
194
+ raise ValueError('Unknown type: %s' % true_json[filename]['type'])
195
+ total_score += score
196
+ num_total += 1
197
+ if num_simple > 0:
198
+ avg_score_simple = total_score_simple / num_simple
199
+ else:
200
+ avg_score_simple = 0
201
+ if num_complex > 0:
202
+ avg_score_complex = total_score_complex / num_complex
203
+ else:
204
+ avg_score_complex = 0
205
+ avg_score = total_score / num_total
206
+ print({'simple': (num_simple,avg_score_simple), 'complex': (num_complex,avg_score_complex), 'total': (num_total,avg_score)})
207
+
208
+ def simplify_html_table(html_table):
209
+ # 使用 BeautifulSoup 解析 HTML
210
+ soup = BeautifulSoup(html_table, 'html.parser')
211
+
212
+ # 找到 <table> 标签
213
+ table = soup.find('table')
214
+ if not table:
215
+ raise ValueError("输入的 HTML 不包含有效的 <table> 标签")
216
+
217
+ # 创建一个新的 <table> 标签
218
+ new_table = BeautifulSoup('<table></table>', 'html.parser').table
219
+
220
+ # 提取所有行(包括 <thead> 和 <tbody> 中的行)
221
+ rows = table.find_all(['tr'], recursive=True)
222
+
223
+ for row in rows:
224
+ # 创建新的 <tr> 标签
225
+ new_row = soup.new_tag('tr')
226
+
227
+ # 处理每一行中的单元格
228
+ cells = row.find_all(['th', 'td'])
229
+ for cell in cells:
230
+ # 将 <th> 替换为 <td>
231
+ new_cell = soup.new_tag('td')
232
+ if cell.has_attr('rowspan'):
233
+ new_cell['rowspan'] = cell['rowspan']
234
+ if cell.has_attr('colspan'):
235
+ new_cell['colspan'] = cell['colspan']
236
+ new_cell.string = cell.get_text(strip=True) # 保留单元格内容
237
+ new_row.append(new_cell)
238
+
239
+ # 将新行添加到新表格中
240
+ new_table.append(new_row)
241
+
242
+ # 返回简化后的表格 HTML
243
+ return str(new_table)
244
+
245
+
246
+ def main():
247
+ parser = argparse.ArgumentParser(description="Evaluate table_to_html task")
248
+ parser.add_argument(
249
+ "workspace",
250
+ help="The filesystem path where work will be stored, can be a local folder",
251
+ )
252
+ parser.add_argument(
253
+ "--gt_file",
254
+ help="Ground truth file",
255
+ )
256
+ parser.add_argument("--n_jobs", type=int, default=40, help="Number of jobs to run in parallel")
257
+ args = parser.parse_args()
258
+
259
+ pred_data = {}
260
+ for file in os.listdir(args.workspace):
261
+ file_path = os.path.join(args.workspace, file)
262
+ pdf_name = file.split('.')[0] + ".png"
263
+ with open(file_path, "r") as f:
264
+ document_text = f.read()
265
+ document_text = replace_single_dollar(replace_double_dollar(document_text))
266
+ markdown_text_list = document_text.split("\n\n")
267
+ new_markdown_text_list = []
268
+ for text in markdown_text_list:
269
+ text = text.strip()
270
+ if (text.startswith("<watermark>") and text.endswith("</watermark>")) or (text.startswith("<img>") and text.endswith("</img>")) or (text.startswith("<page_number>") and text.endswith("</page_number>")) or (text.startswith("<signature>") and text.endswith("</signature>")):
271
+ continue
272
+ else:
273
+ html_text = str(markdown2.markdown(text,extras=["tables"]))
274
+ html_text = html_text.strip()
275
+ if html_text.startswith("<table>") and html_text.endswith("</table>"):
276
+ html_table = simplify_html_table(html_text)
277
+ new_markdown_text_list.append(html_table)
278
+ else:
279
+ text = turn_header_to_h1(text)
280
+ new_markdown_text_list.append(text)
281
+
282
+ pred_data[os.path.basename(pdf_name)] = "\n\n".join(new_markdown_text_list)
283
+
284
+
285
+ gt_data = {}
286
+ with open(args.gt_file, "r") as f:
287
+ for line in f:
288
+ data = json.loads(line)
289
+ gt_data[data['image_name']] = {'html':data['gt_table'], 'type':data['type']}
290
+
291
+ teds = TEDS(n_jobs=args.n_jobs, ignore_nodes=['b', 'thead', 'tbody'])
292
+ teds.batch_evaluate(pred_data, gt_data)
293
+
294
+ if __name__ == "__main__":
295
+ main()
eval/eval_table_to_html_olmocr.py ADDED
@@ -0,0 +1,212 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import distance
5
+ import markdown2
6
+ from apted import APTED, Config
7
+ from apted.helpers import Tree
8
+ from lxml import etree, html
9
+ from collections import deque
10
+ from tqdm import tqdm
11
+ from eval.parallel import parallel_process
12
+
13
+
14
+ class TableTree(Tree):
15
+ def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
16
+ self.tag = tag
17
+ self.colspan = colspan
18
+ self.rowspan = rowspan
19
+ self.content = content
20
+ self.children = list(children)
21
+
22
+ def bracket(self):
23
+ """Show tree using brackets notation"""
24
+ if self.tag == 'td':
25
+ result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
26
+ (self.tag, self.colspan, self.rowspan, self.content)
27
+ else:
28
+ result = '"tag": %s' % self.tag
29
+ for child in self.children:
30
+ result += child.bracket()
31
+ return "{{{}}}".format(result)
32
+
33
+
34
+ class CustomConfig(Config):
35
+ @staticmethod
36
+ def maximum(*sequences):
37
+ """Get maximum possible value
38
+ """
39
+ return max(map(len, sequences))
40
+
41
+ def normalized_distance(self, *sequences):
42
+ """Get distance from 0 to 1
43
+ """
44
+ return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
45
+
46
+ def rename(self, node1, node2):
47
+ """Compares attributes of trees"""
48
+ if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
49
+ return 1.
50
+ if node1.tag == 'td':
51
+ if node1.content or node2.content:
52
+ return self.normalized_distance(node1.content, node2.content)
53
+ return 0.
54
+
55
+
56
+ class TEDS(object):
57
+ ''' Tree Edit Distance based Similarity
58
+ '''
59
+ def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
60
+ assert isinstance(n_jobs, int) and (n_jobs >= 1), 'n_jobs must be an integer greater than or equal to 1'
61
+ self.structure_only = structure_only
62
+ self.n_jobs = n_jobs
63
+ self.ignore_nodes = ignore_nodes
64
+ self.__tokens__ = []
65
+
66
+ def tokenize(self, node):
67
+ ''' Tokenizes table cells
68
+ '''
69
+ self.__tokens__.append('<%s>' % node.tag)
70
+ if node.text is not None:
71
+ self.__tokens__ += list(node.text)
72
+ for n in node.getchildren():
73
+ self.tokenize(n)
74
+ if node.tag != 'unk':
75
+ self.__tokens__.append('</%s>' % node.tag)
76
+ if node.tag != 'td' and node.tail is not None:
77
+ self.__tokens__ += list(node.tail)
78
+
79
+ def load_html_tree(self, node, parent=None):
80
+ ''' Converts HTML tree to the format required by apted
81
+ '''
82
+ global __tokens__
83
+ if node.tag == 'td':
84
+ if self.structure_only:
85
+ cell = []
86
+ else:
87
+ self.__tokens__ = []
88
+ self.tokenize(node)
89
+ cell = self.__tokens__[1:-1].copy()
90
+ new_node = TableTree(node.tag,
91
+ int(node.attrib.get('colspan', '1')),
92
+ int(node.attrib.get('rowspan', '1')),
93
+ cell, *deque())
94
+ else:
95
+ new_node = TableTree(node.tag, None, None, None, *deque())
96
+ if parent is not None:
97
+ parent.children.append(new_node)
98
+ if node.tag != 'td':
99
+ for n in node.getchildren():
100
+ self.load_html_tree(n, new_node)
101
+ if parent is None:
102
+ return new_node
103
+
104
+ def evaluate(self, pred, true):
105
+ ''' Computes TEDS score between the prediction and the ground truth of a
106
+ given sample
107
+ '''
108
+ if (not pred) or (not true):
109
+ return 0.0
110
+ pred.replace("<th>","<td>")
111
+ pred.replace("</th>","</td>")
112
+ pred = "<html>" + pred + "</html>"
113
+ true = "<html>" + true + "</html>"
114
+ parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
115
+ pred = html.fromstring(pred, parser=parser)
116
+ true = html.fromstring(true, parser=parser)
117
+ if pred.xpath('body/table') and true.xpath('body/table'):
118
+ pred = pred.xpath('body/table')[0]
119
+ true = true.xpath('body/table')[0]
120
+ if self.ignore_nodes:
121
+ etree.strip_tags(pred, *self.ignore_nodes)
122
+ etree.strip_tags(true, *self.ignore_nodes)
123
+ n_nodes_pred = len(pred.xpath(".//*"))
124
+ n_nodes_true = len(true.xpath(".//*"))
125
+ n_nodes = max(n_nodes_pred, n_nodes_true)
126
+ tree_pred = self.load_html_tree(pred)
127
+ tree_true = self.load_html_tree(true)
128
+ distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
129
+ return 1.0 - (float(distance) / n_nodes)
130
+ else:
131
+ return 0.0
132
+
133
+ def batch_evaluate(self, pred_json, true_json):
134
+ ''' Computes TEDS score between the prediction and the ground truth of
135
+ a batch of samples
136
+ @params pred_json: {'FILENAME': 'HTML CODE', ...}
137
+ @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
138
+ @output: {'FILENAME': 'TEDS SCORE', ...}
139
+ '''
140
+ samples = true_json.keys()
141
+ if self.n_jobs == 1:
142
+ scores = [self.evaluate(pred_json.get(filename, ''), true_json[filename]['html']) for filename in tqdm(samples)]
143
+ else:
144
+ inputs = [{'pred': pred_json.get(filename, ''), 'true': true_json[filename]['html']} for filename in samples]
145
+ scores = parallel_process(inputs, self.evaluate, use_kwargs=True, n_jobs=self.n_jobs, front_num=1)
146
+ total_score_simple = 0
147
+ num_simple = 0
148
+ total_score_complex = 0
149
+ num_complex = 0
150
+ total_score = 0
151
+ num_total = 0
152
+ for filename,score in zip(samples, scores):
153
+ print(filename)
154
+ print(score)
155
+ print('')
156
+ if true_json[filename]['type'] == 'simple':
157
+ total_score_simple += score
158
+ num_simple += 1
159
+ elif true_json[filename]['type'] == 'complex':
160
+ total_score_complex += score
161
+ num_complex += 1
162
+ else:
163
+ raise ValueError('Unknown type: %s' % true_json[filename]['type'])
164
+ total_score += score
165
+ num_total += 1
166
+ if num_simple > 0:
167
+ avg_score_simple = total_score_simple / num_simple
168
+ else:
169
+ avg_score_simple = 0
170
+ if num_complex > 0:
171
+ avg_score_complex = total_score_complex / num_complex
172
+ else:
173
+ avg_score_complex = 0
174
+ avg_score = total_score / num_total
175
+ print({'simple': (num_simple,avg_score_simple), 'complex': (num_complex,avg_score_complex), 'total': (num_total,avg_score)})
176
+
177
+ def main():
178
+ parser = argparse.ArgumentParser(description="Evaluate table_to_html task")
179
+ parser.add_argument(
180
+ "workspace",
181
+ help="The filesystem path where work will be stored, can be a local folder",
182
+ )
183
+ parser.add_argument(
184
+ "--gt_file",
185
+ help="Ground truth file",
186
+ )
187
+ parser.add_argument("--n_jobs", type=int, default=40, help="Number of jobs to run in parallel")
188
+ args = parser.parse_args()
189
+
190
+ pred_data = {}
191
+ root_dir = os.path.join(args.workspace, "results")
192
+ for jsonl_file in os.listdir(root_dir):
193
+ if jsonl_file.endswith(".jsonl"):
194
+ with open(os.path.join(root_dir, jsonl_file), "r") as f:
195
+ for line in f:
196
+ data = json.loads(line)
197
+ pdf_path = os.path.basename(data['metadata']['Source-File'])
198
+ document_text = data['text']
199
+ pred_data[pdf_path] = str(markdown2.markdown(document_text,extras=["tables"]))
200
+
201
+
202
+ gt_data = {}
203
+ with open(args.gt_file, "r") as f:
204
+ for line in f:
205
+ data = json.loads(line)
206
+ gt_data[data['image_name']] = {'html':data['gt_table'], 'type':data['type']}
207
+
208
+ teds = TEDS(n_jobs=args.n_jobs, ignore_nodes=['b', 'thead', 'tbody'])
209
+ teds.batch_evaluate(pred_data, gt_data)
210
+
211
+ if __name__ == "__main__":
212
+ main()
eval/gen_element_merge_detect_data.py ADDED
@@ -0,0 +1,36 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+
5
+ def main():
6
+ parser = argparse.ArgumentParser(description="Generate evaluation data for the element_merge_detect task")
7
+ parser.add_argument(
8
+ "workspace",
9
+ help="The filesystem path where work will be stored, can be a local folder",
10
+ )
11
+ args = parser.parse_args()
12
+
13
+ json_dir = os.path.join(args.workspace, 'jsons')
14
+ if not os.path.exists(json_dir):
15
+ os.makedirs(json_dir)
16
+
17
+ jsonl_file = os.path.join(args.workspace, "data.jsonl")
18
+ with open(jsonl_file, "r") as f:
19
+ for line in f:
20
+ data = json.loads(line)
21
+ pdf_name_1 = data['pdf_name_1'].split(".")[0]
22
+ pdf_name_2 = data['pdf_name_2'].split(".")[0]
23
+
24
+ pdf_name,page_1 = pdf_name_1.split('_')
25
+ pdf_name,page_2 = pdf_name_2.split('_')
26
+
27
+ json_name = os.path.join(json_dir, pdf_name + '_' + page_1 + '_' + page_2 + '.json')
28
+ data = {
29
+ "page_1": "\n\n".join(data['md_elem_list_1']),
30
+ "page_2": "\n\n".join(data['md_elem_list_2']),
31
+ }
32
+ with open(json_name, 'w') as f:
33
+ json.dump(data, f)
34
+
35
+ if __name__ == "__main__":
36
+ main()
eval/gen_html_table_merge_data.py ADDED
@@ -0,0 +1,32 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+
5
+ def main():
6
+ parser = argparse.ArgumentParser(description="Generate evaluation data for the html_table_merge task")
7
+ parser.add_argument(
8
+ "workspace",
9
+ help="The filesystem path where work will be stored, can be a local folder",
10
+ )
11
+ args = parser.parse_args()
12
+
13
+ json_dir = os.path.join(args.workspace, 'jsons')
14
+ if not os.path.exists(json_dir):
15
+ os.makedirs(json_dir)
16
+
17
+ jsonl_file = os.path.join(args.workspace, 'data.jsonl')
18
+ with open(jsonl_file, "r") as f:
19
+ for line in f:
20
+ data = json.loads(line)
21
+ json_name = data['image_name'].split('.')[0] + '.json'
22
+
23
+ json_path = os.path.join(json_dir, json_name)
24
+ data = {
25
+ "table_1": data['table_fragment_1'],
26
+ "table_2": data['table_fragment_2'],
27
+ }
28
+ with open(json_path, 'w') as f:
29
+ json.dump(data, f)
30
+
31
+ if __name__ == "__main__":
32
+ main()
eval/parallel.py ADDED
@@ -0,0 +1,50 @@
1
+ from tqdm import tqdm
2
+ from concurrent.futures import ProcessPoolExecutor, as_completed
3
+
4
+ def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=0):
5
+ """
6
+ A parallel version of the map function with a progress bar.
7
+
8
+ Args:
9
+ array (array-like): An array to iterate over.
10
+ function (function): A python function to apply to the elements of array
11
+ n_jobs (int, default=16): The number of cores to use
12
+ use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of
13
+ keyword arguments to function
14
+ front_num (int, default=0): The number of iterations to run serially before kicking off the parallel job.
15
+ Useful for catching bugs
16
+ Returns:
17
+ [function(array[0]), function(array[1]), ...]
18
+ """
19
+ # We run the first few iterations serially to catch bugs
20
+ if front_num > 0:
21
+ front = [function(**a) if use_kwargs else function(a) for a in array[:front_num]]
22
+ else:
23
+ front = []
24
+ # If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging.
25
+ if n_jobs == 1:
26
+ return front + [function(**a) if use_kwargs else function(a) for a in tqdm(array[front_num:])]
27
+ # Assemble the workers
28
+ with ProcessPoolExecutor(max_workers=n_jobs) as pool:
29
+ # Pass the elements of array into function
30
+ if use_kwargs:
31
+ futures = [pool.submit(function, **a) for a in array[front_num:]]
32
+ else:
33
+ futures = [pool.submit(function, a) for a in array[front_num:]]
34
+ kwargs = {
35
+ 'total': len(futures),
36
+ 'unit': 'it',
37
+ 'unit_scale': True,
38
+ 'leave': True
39
+ }
40
+ # Print out the progress as tasks complete
41
+ for f in tqdm(as_completed(futures), **kwargs):
42
+ pass
43
+ out = []
44
+ # Get the results from the futures.
45
+ for i, future in tqdm(enumerate(futures)):
46
+ try:
47
+ out.append(future.result())
48
+ except Exception as e:
49
+ out.append(e)
50
+ return front + out
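+ # Typical use in the eval scripts:
+ # scores = parallel_process(inputs, evaluate, use_kwargs=True, n_jobs=args.n_jobs, front_num=1)
+ # where each element of `inputs` is a dict of keyword arguments for `evaluate`.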
ocrflux/check.py ADDED
@@ -0,0 +1,44 @@
1
+ import importlib.util
2
+ import logging
3
+ import subprocess
4
+ import sys
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ def check_poppler_version():
10
+ try:
11
+ result = subprocess.run(["pdftoppm", "-h"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
12
+ if result.returncode == 0 and result.stderr.startswith("pdftoppm"):
13
+ logger.info("pdftoppm is installed and working.")
14
+ else:
15
+ logger.error("pdftoppm is installed but returned an error.")
16
+ sys.exit(1)
17
+ except FileNotFoundError:
18
+ logger.error("pdftoppm is not installed.")
19
+ sys.exit(1)
20
+
21
+ def check_vllm_version():
22
+ if importlib.util.find_spec("vllm") is None:
23
+ logger.error("VLLM needs to be installed with a separate command in order to find all dependencies properly.")
24
+ sys.exit(1)
25
+
26
+
27
+ def check_torch_gpu_available(min_gpu_memory: int = 20 * 1024**3):
28
+ try:
29
+ import torch
30
+ except:
31
+ logger.error("Pytorch must be installed, visit https://pytorch.org/ for installation instructions")
32
+ raise
33
+
34
+ try:
35
+ gpu_memory = torch.cuda.get_device_properties(0).total_memory
36
+ assert gpu_memory >= min_gpu_memory
37
+ except:
38
+ logger.error(f"Torch was not able to find a GPU with at least {min_gpu_memory // (1024 ** 3)} GB of RAM.")
39
+ raise
40
+
41
+
42
+ if __name__ == "__main__":
43
+ check_poppler_version()
44
+ check_vllm_version()
ocrflux/image_utils.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import io
4
+ from typing import List, Union
5
+ from PIL import Image
6
+
7
+
8
+ def get_page_image(pdf_path, page_number, target_longest_image_dim=None, image_rotation=0):
9
+ if pdf_path.lower().endswith(".pdf"):
10
+ # Convert PDF page to PNG using pdftoppm
11
+ pdftoppm_result = subprocess.run(
12
+ [
13
+ "pdftoppm",
14
+ "-png",
15
+ "-f",
16
+ str(page_number),
17
+ "-l",
18
+ str(page_number),
19
+ "-r",
20
+ "72", # 72 pixels per point is the conversion factor
21
+ pdf_path,
22
+ ],
23
+ timeout=120,
24
+ stdout=subprocess.PIPE,
25
+ stderr=subprocess.PIPE,
26
+ )
27
+ assert pdftoppm_result.returncode == 0, pdftoppm_result.stderr
28
+ image = Image.open(io.BytesIO(pdftoppm_result.stdout))
29
+ else:
30
+ image = Image.open(pdf_path)
31
+ if image_rotation != 0:
32
+ image = image.rotate(-image_rotation, expand=True)
33
+ if target_longest_image_dim is not None:
34
+ width, height = image.size
35
+ if width > height:
36
+ new_width = target_longest_image_dim
37
+ new_height = int(height * (target_longest_image_dim / width))
38
+ else:
39
+ new_height = target_longest_image_dim
40
+ new_width = int(width * (target_longest_image_dim / height))
41
+ image = image.resize((new_width, new_height))
42
+ return image
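+ # At 72 DPI one rendered pixel corresponds to one PDF point, so a US-Letter page
+ # (612 x 792 pt) comes out at roughly 612 x 792 pixels before the optional resize.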
43
+
44
+
45
+ def is_image(file_path):
46
+ try:
47
+ Image.open(file_path)
48
+ return True
49
+ except:
50
+ return False
ocrflux/inference.py ADDED
@@ -0,0 +1,237 @@
1
+ import json
2
+ import copy
3
+ from PIL import Image
4
+ from pypdf import PdfReader
5
+ from vllm import LLM, SamplingParams
6
+ from ocrflux.image_utils import get_page_image
7
+ from ocrflux.table_format import table_matrix2html
8
+ from ocrflux.prompts import PageResponse, build_page_to_markdown_prompt, build_element_merge_detect_prompt, build_html_table_merge_prompt
9
+
10
+ def build_qwen2_5_vl_prompt(question):
11
+ return (
12
+ "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
13
+ f"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
14
+ f"{question}<|im_end|>\n"
15
+ "<|im_start|>assistant\n"
16
+ )
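+ # This mirrors the Qwen2.5-VL chat-template layout: a system turn, a user turn whose image
+ # is injected through the vision placeholder tokens, and an open assistant turn for generation.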
17
+
18
+ def build_page_to_markdown_query(file_path: str, page_number: int, target_longest_image_dim: int = 1024, image_rotation: int = 0) -> dict:
19
+ assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query"
20
+ image = get_page_image(file_path, page_number, target_longest_image_dim=target_longest_image_dim, image_rotation=image_rotation)
21
+ question = build_page_to_markdown_prompt()
22
+ prompt = build_qwen2_5_vl_prompt(question)
23
+ query = {
24
+ "prompt": prompt,
25
+ "multi_modal_data": {"image": image},
26
+ }
27
+ return query
28
+
29
+ def build_element_merge_detect_query(text_list_1,text_list_2) -> dict:
30
+ image = Image.new('RGB', (28, 28), color='black')
31
+ question = build_element_merge_detect_prompt(text_list_1,text_list_2)
32
+ prompt = build_qwen2_5_vl_prompt(question)
33
+ query = {
34
+ "prompt": prompt,
35
+ "multi_modal_data": {"image": image},
36
+ }
37
+ return query
38
+
39
+ def build_html_table_merge_query(text_1,text_2) -> dict:
40
+ image = Image.new('RGB', (28, 28), color='black')
41
+ question = build_html_table_merge_prompt(text_1,text_2)
42
+ prompt = build_qwen2_5_vl_prompt(question)
43
+ query = {
44
+ "prompt": prompt,
45
+ "multi_modal_data": {"image": image},
46
+ }
47
+ return query
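+ # The merge-detect and table-merge stages are text-only, but the prompt template above always
+ # contains an image placeholder, so a dummy 28x28 black image is passed to satisfy it.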
48
+
49
+ def bulid_document_text(page_to_markdown_result, element_merge_detect_result, html_table_merge_result):
50
+ page_to_markdown_keys = list(page_to_markdown_result.keys())
51
+ element_merge_detect_keys = list(element_merge_detect_result.keys())
52
+ html_table_merge_keys = list(html_table_merge_result.keys())
53
+
54
+ for page_1,page_2,elem_idx_1,elem_idx_2 in sorted(html_table_merge_keys,key=lambda x: -x[0]):
55
+ page_to_markdown_result[page_1][elem_idx_1] = html_table_merge_result[(page_1,page_2,elem_idx_1,elem_idx_2)]
56
+ page_to_markdown_result[page_2][elem_idx_2] = ''
57
+
58
+ for page_1,page_2 in sorted(element_merge_detect_keys,key=lambda x: -x[0]):
59
+ for elem_idx_1,elem_idx_2 in element_merge_detect_result[(page_1,page_2)]:
60
+ if len(page_to_markdown_result[page_1][elem_idx_1]) == 0 or page_to_markdown_result[page_1][elem_idx_1][-1] == '-' or ('\u4e00' <= page_to_markdown_result[page_1][elem_idx_1][-1] <= '\u9fff'):
61
+ page_to_markdown_result[page_1][elem_idx_1] = page_to_markdown_result[page_1][elem_idx_1] + '' + page_to_markdown_result[page_2][elem_idx_2]
62
+ else:
63
+ page_to_markdown_result[page_1][elem_idx_1] = page_to_markdown_result[page_1][elem_idx_1] + ' ' + page_to_markdown_result[page_2][elem_idx_2]
64
+ page_to_markdown_result[page_2][elem_idx_2] = ''
65
+
66
+ document_text_list = []
67
+ for page in page_to_markdown_keys:
68
+ page_text_list = [s for s in page_to_markdown_result[page] if s]
69
+ document_text_list += page_text_list
70
+ return "\n\n".join(document_text_list)
71
+
72
+ def parse(llm,file_path,skip_cross_page_merge=False,max_page_retries=0):
73
+ sampling_params = SamplingParams(temperature=0.0,max_tokens=8192)
74
+ if file_path.lower().endswith(".pdf"):
75
+ try:
76
+ reader = PdfReader(file_path)
77
+ num_pages = reader.get_num_pages()
78
+ except:
79
+ return None
80
+ else:
81
+ num_pages = 1
82
+
83
+ try:
84
+ # Stage 1: Page to Markdown
85
+ page_to_markdown_query_list = [build_page_to_markdown_query(file_path,page_num) for page_num in range(1, num_pages + 1)]
86
+ responses = llm.generate(page_to_markdown_query_list, sampling_params=sampling_params)
87
+ results = [response.outputs[0].text for response in responses]
88
+ page_to_markdown_result = {}
89
+ retry_list = []
90
+ for i,result in enumerate(results):
91
+ try:
92
+ json_data = json.loads(result)
93
+ page_response = PageResponse(**json_data)
94
+ natural_text = page_response.natural_text
95
+ markdown_element_list = []
96
+ for text in natural_text.split('\n\n'):
97
+ if text.startswith("<Image>") and text.endswith("</Image>"):
98
+ pass
99
+ elif text.startswith("<table>") and text.endswith("</table>"):
100
+ try:
101
+ new_text = table_matrix2html(text)
102
+ except:
103
+ new_text = text.replace("<t>","").replace("<l>","").replace("<lt>","")
104
+ markdown_element_list.append(new_text)
105
+ else:
106
+ markdown_element_list.append(text)
107
+ page_to_markdown_result[i+1] = markdown_element_list
108
+ except:
109
+ retry_list.append(i)
110
+
111
+ attempt = 0
112
+ while len(retry_list) > 0 and attempt < max_page_retries:
113
+ retry_page_to_markdown_query_list = [build_page_to_markdown_query(file_path,page_num) for page_num in retry_list]
114
+ retry_sampling_params = SamplingParams(temperature=0.1*attempt, max_tokens=8192)
115
+ responses = llm.generate(retry_page_to_markdown_query_list, sampling_params=retry_sampling_params)
116
+ results = [response.outputs[0].text for response in responses]
117
+ next_retry_list = []
118
+ for i,result in zip(retry_list,results):
119
+ try:
120
+ json_data = json.loads(result)
121
+ page_response = PageResponse(**json_data)
122
+ natural_text = page_response.natural_text
123
+ markdown_element_list = []
124
+ for text in natural_text.split('\n\n'):
125
+ if text.startswith("<Image>") and text.endswith("</Image>"):
126
+ pass
127
+ elif text.startswith("<table>") and text.endswith("</table>"):
128
+ try:
129
+ new_text = table_matrix2html(text)
130
+ except:
131
+ new_text = text.replace("<t>","").replace("<l>","").replace("<lt>","")
132
+ markdown_element_list.append(new_text)
133
+ else:
134
+ markdown_element_list.append(text)
135
+ page_to_markdown_result[i+1] = markdown_element_list
136
+ except:
137
+ next_retry_list.append(i)
138
+ retry_list = next_retry_list
139
+ attempt += 1
140
+
141
+ page_texts = {}
142
+ fallback_pages = []
143
+ for page_number in range(1, num_pages+1):
144
+ if page_number not in page_to_markdown_result.keys():
145
+ fallback_pages.append(page_number-1)
146
+ else:
147
+ page_texts[str(page_number-1)] = "\n\n".join(page_to_markdown_result[page_number])
148
+
149
+ if skip_cross_page_merge:
150
+ document_text_list = []
151
+ for i in range(num_pages):
152
+ if i not in fallback_pages:
153
+ document_text_list.append(page_texts[str(i)])
154
+ document_text = "\n\n".join(document_text_list)
155
+ return {
156
+ "orig_path": file_path,
157
+ "num_pages": num_pages,
158
+ "document_text": document_text,
159
+ "page_texts": page_texts,
160
+ "fallback_pages": fallback_pages,
161
+ }
162
+
163
+ # Stage 2: Element Merge Detect
164
+ element_merge_detect_keys = []
165
+ element_merge_detect_query_list = []
166
+ for page_num in range(1,num_pages):
167
+ if page_num in page_to_markdown_result.keys() and page_num+1 in page_to_markdown_result.keys():
168
+ element_merge_detect_query_list.append(build_element_merge_detect_query(page_to_markdown_result[page_num],page_to_markdown_result[page_num+1]))
169
+ element_merge_detect_keys.append((page_num,page_num+1))
170
+ responses = llm.generate(element_merge_detect_query_list, sampling_params=sampling_params)
171
+ results = [response.outputs[0].text for response in responses]
172
+ element_merge_detect_result = {}
173
+ for key,result in zip(element_merge_detect_keys,results):
174
+ try:
175
+ element_merge_detect_result[key] = eval(result)
176
+ except:
177
+ pass
178
+
179
+ # Stage 3: HTML Table Merge
180
+ html_table_merge_keys = []
181
+ for key,result in element_merge_detect_result.items():
182
+ page_1,page_2 = key
183
+ for elem_idx_1,elem_idx_2 in result:
184
+ text_1 = page_to_markdown_result[page_1][elem_idx_1]
185
+ text_2 = page_to_markdown_result[page_2][elem_idx_2]
186
+ if text_1.startswith("<table>") and text_1.endswith("</table>") and text_2.startswith("<table>") and text_2.endswith("</table>"):
187
+ html_table_merge_keys.append((page_1,page_2,elem_idx_1,elem_idx_2))
188
+
189
+ html_table_merge_keys = sorted(html_table_merge_keys,key=lambda x: -x[0])
190
+
191
+ html_table_merge_result = {}
192
+ page_to_markdown_result_tmp = copy.deepcopy(page_to_markdown_result)
193
+ i = 0
194
+ while i < len(html_table_merge_keys):
195
+ tmp = set()
196
+ keys = []
197
+ while i < len(html_table_merge_keys):
198
+ page_1,page_2,elem_idx_1,elem_idx_2 = html_table_merge_keys[i]
199
+ if (page_2,elem_idx_2) in tmp:
200
+ break
201
+ tmp.add((page_1,elem_idx_1))
202
+ keys.append((page_1,page_2,elem_idx_1,elem_idx_2))
203
+ i += 1
204
+
205
+ html_table_merge_query_list = [build_html_table_merge_query(page_to_markdown_result_tmp[page_1][elem_idx_1],page_to_markdown_result_tmp[page_2][elem_idx_2]) for page_1,page_2,elem_idx_1,elem_idx_2 in keys]
206
+ responses = llm.generate(html_table_merge_query_list, sampling_params=sampling_params)
207
+ results = [response.outputs[0].text for response in responses]
208
+ for key,result in zip(keys,results):
209
+ if result.startswith("<table>") and result.endswith("</table>"):
210
+ html_table_merge_result[key] = result
211
+ page_to_markdown_result_tmp[page_1][elem_idx_1] = result
212
+
213
+ document_text = bulid_document_text(page_to_markdown_result, element_merge_detect_result, html_table_merge_result)
214
+ return {
215
+ "orig_path": file_path,
216
+ "num_pages": num_pages,
217
+ "document_text": document_text,
218
+ "page_texts": page_texts,
219
+ "fallback_pages": fallback_pages,
220
+ }
221
+ except:
222
+ return None
223
+
224
+
225
+ if __name__ == '__main__':
226
+ file_path = 'test.pdf'
227
+ llm = LLM(model="ChatDOC/OCRFlux-3B",gpu_memory_utilization=0.8,max_model_len=8192)
228
+ result = parse(llm,file_path,max_page_retries=4)
229
+ if result != None:
230
+ document_markdown = result['document_text']
231
+ print(document_markdown)
232
+ with open('test.md','w') as f:
233
+ f.write(document_markdown)
234
+ else:
235
+ print("Parse failed")
236
+
237
+
ocrflux/jsonl_to_markdown.py ADDED
@@ -0,0 +1,37 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ def main():
5
+ parser = argparse.ArgumentParser(description="Convert result JSONL files to markdown")
6
+ parser.add_argument(
7
+ "workspace",
8
+ help="The filesystem path where work will be stored, can be a local folder",
9
+ )
10
+ parser.add_argument("--show_page_result", action="store_true", help="Whether to show the markdown of each page")
11
+ args = parser.parse_args()
12
+
13
+ src_dir = os.path.join(args.workspace, "results")
14
+ tgt_dir = os.path.join(args.workspace, "markdowns")
15
+ if not os.path.exists(tgt_dir):
16
+ os.makedirs(tgt_dir)
17
+ for jsonl_file in os.listdir(src_dir):
18
+ if jsonl_file.endswith(".jsonl"):
19
+ with open(os.path.join(src_dir, jsonl_file), "r") as f:
20
+ for line in f:
21
+ data = json.loads(line)
22
+ markdown_text = data['document_text']
23
+ file_name = os.path.basename(data['orig_path']).split(".")[0]
24
+ file_dir = os.path.join(tgt_dir, file_name)
25
+ if not os.path.exists(file_dir):
26
+ os.makedirs(file_dir)
27
+ with open(os.path.join(file_dir, file_name+".md"), "w") as f:
28
+ f.write(markdown_text)
29
+ if args.show_page_result:
30
+ page_texts = data["page_texts"]
31
+ for page_num in page_texts.keys():
32
+ page_text = page_texts[page_num]
33
+ with open(os.path.join(file_dir, file_name+"_"+str(page_num)+".md"), "w") as out_f:
34
+ out_f.write(page_text)
35
+
36
+ if __name__ == "__main__":
37
+ main()
ocrflux/metrics.py ADDED
@@ -0,0 +1,147 @@
1
+ import asyncio
2
+ import time
3
+ from collections import defaultdict, deque
4
+ from typing import Any, Deque, Dict, List, Set
5
+
6
+
7
+ class MetricsKeeper:
8
+ def __init__(self, window=60 * 5):
9
+ """
10
+ Initializes the MetricsKeeper.
11
+
12
+ Args:
13
+ window (int): Time window in seconds for recent metrics. Defaults to 5 minutes.
14
+ """
15
+ self.window = window # Time window in seconds
16
+ self.start_time = time.time() # Timestamp when MetricsKeeper was created
17
+ self.total_metrics = defaultdict(int) # Cumulative metrics since start
18
+ self.window_metrics: Deque[Any] = deque() # Deque to store (timestamp, metrics_dict)
19
+ self.window_sum = defaultdict(int) # Sum of metrics within the window
20
+
21
+ def add_metrics(self, **kwargs):
22
+ """
23
+ Adds metrics to the keeper.
24
+
25
+ Args:
26
+ **kwargs: Arbitrary keyword arguments representing metric names and their values.
27
+ """
28
+ current_time = time.time()
29
+ # Update cumulative metrics
30
+ for key, value in kwargs.items():
31
+ self.total_metrics[key] += value
32
+
33
+ # Append current metrics with timestamp to the deque
34
+ self.window_metrics.append((current_time, kwargs))
35
+
36
+ # Update window sums
37
+ for key, value in kwargs.items():
38
+ self.window_sum[key] += value
39
+
40
+ # Remove metrics that are outside the time window
41
+ while self.window_metrics and self.window_metrics[0][0] < current_time - self.window:
42
+ old_time, old_metrics = self.window_metrics.popleft()
43
+ for key, value in old_metrics.items():
44
+ self.window_sum[key] -= value
45
+ if self.window_sum[key] <= 0:
46
+ del self.window_sum[key] # Clean up to prevent negative counts
47
+
48
+ def __str__(self):
49
+ """
50
+ Returns a formatted string of metrics showing tokens/sec since start and within the window.
51
+
52
+ Returns:
53
+ str: Formatted metrics string as a table.
54
+ """
55
+ current_time = time.time()
56
+ elapsed_time = current_time - self.start_time
57
+ window_time = min(self.window, elapsed_time) if elapsed_time > 0 else 1 # Prevent division by zero
58
+
59
+ # Header
60
+ header = f"{'Metric Name':<30} {'Lifetime (tokens/sec)':>25} {'Recently (tokens/sec)':>25}"
61
+ separator = "-" * len(header)
62
+ lines = [header, separator]
63
+
64
+ # Sort metrics alphabetically for consistency
65
+ for key in sorted(self.total_metrics.keys()):
66
+ total = self.total_metrics[key]
67
+ window = self.window_sum.get(key, 0)
68
+ total_rate = total / elapsed_time if elapsed_time > 0 else 0
69
+ window_rate = window / window_time if window_time > 0 else 0
70
+ line = f"{key:<30} {total_rate:>25.2f} {window_rate:>25.2f}"  # width matches the 30-char header column
71
+ lines.append(line)
72
+
73
+ return "\n".join(lines)
74
+
75
+
76
+ class WorkerTracker:
77
+ def __init__(self):
78
+ """
79
+ Initializes the WorkerTracker with a default dictionary.
80
+ Each worker ID maps to another dictionary that holds counts for each state.
81
+ """
82
+ # Mapping from worker_id to a dictionary of state counts
83
+ self.worker_status: Dict[int, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
84
+ self.lock = asyncio.Lock()
85
+
86
+ async def clear_work(self, worker_id: int):
87
+ async with self.lock:
88
+ self.worker_status[worker_id].clear()
89
+
90
+ async def track_work(self, worker_id: int, work_item_id: str, state: str):
91
+ """
92
+ Update the state count for a specific worker.
93
+
94
+ Args:
95
+ worker_id (int): The ID of the worker.
96
+ work_item_id (str): The unique identifier of the work item (unused in this implementation).
97
+ state (str): The state to increment for the work item.
98
+ """
99
+ async with self.lock:
100
+ self.worker_status[worker_id][state] += 1
101
+
102
+ async def get_status_table(self) -> str:
103
+ """
104
+ Generate a formatted table of the current status of all workers.
105
+
106
+ Returns:
107
+ str: A string representation of the workers' statuses.
108
+ """
109
+ async with self.lock:
110
+ # Determine all unique states across all workers
111
+ all_states: Set[str] = set()
112
+ for states in self.worker_status.values():
113
+ all_states.update(states.keys())
114
+ sorted_states: List[str] = sorted(all_states)
115
+
116
+ headers = ["Worker ID"] + sorted_states # type: ignore
117
+ rows = []
118
+ for worker_id, states in sorted(self.worker_status.items()):
119
+ row = [str(worker_id)]
120
+ for state in sorted_states:
121
+ count = states.get(state, 0)
122
+ row.append(str(count))
123
+ rows.append(row)
124
+
125
+ # Calculate column widths
126
+ col_widths = [len(header) for header in headers]
127
+ for row in rows:
128
+ for idx, cell in enumerate(row):
129
+ col_widths[idx] = max(col_widths[idx], len(cell))
130
+
131
+ # Create the table header
132
+ header_line = " | ".join(header.ljust(col_widths[idx]) for idx, header in enumerate(headers))
133
+ separator = "-+-".join("-" * col_widths[idx] for idx in range(len(headers)))
134
+
135
+ # Create the table rows
136
+ row_lines = [" | ".join(cell.ljust(col_widths[idx]) for idx, cell in enumerate(row)) for row in rows]
137
+
138
+ # Combine all parts
139
+ table = "\n".join([header_line, separator] + row_lines)
140
+ return table
141
+
142
+ def __str__(self):
143
+ """
144
+ String representation is not directly supported.
145
+ Use 'await get_status_table()' to retrieve the status table.
146
+ """
147
+ raise NotImplementedError("Use 'await get_status_table()' to get the status table.")
ocrflux/pipeline.py ADDED
@@ -0,0 +1,861 @@
1
+ import argparse
2
+ import asyncio
3
+ import atexit
4
+ import base64
5
+ import json
6
+ import logging
7
+ import shutil
8
+ import os
9
+ import copy
10
+ import random
11
+ import re
12
+ import sys
13
+ import time
14
+ from concurrent.futures.process import BrokenProcessPool
15
+ from io import BytesIO
16
+ from urllib.parse import urlparse
17
+
18
+ import httpx
19
+ from huggingface_hub import snapshot_download
20
+ from PIL import Image
21
+ from pypdf import PdfReader
22
+ from tqdm import tqdm
23
+
24
+ from ocrflux.check import (
25
+ check_poppler_version,
26
+ check_vllm_version,
27
+ check_torch_gpu_available,
28
+ )
29
+ from ocrflux.image_utils import get_page_image, is_image
30
+ from ocrflux.table_format import trans_markdown_text
31
+ from ocrflux.metrics import MetricsKeeper, WorkerTracker
32
+ from ocrflux.prompts import PageResponse, build_page_to_markdown_prompt, build_element_merge_detect_prompt, build_html_table_merge_prompt
33
+ from ocrflux.work_queue import LocalWorkQueue, WorkQueue
34
+
35
+ # Initialize logger
36
+ logger = logging.getLogger(__name__)
37
+ logger.setLevel(logging.DEBUG)
38
+ logger.propagate = False
39
+
40
+ vllm_logger = logging.getLogger("vllm")
41
+ vllm_logger.propagate = False
42
+
43
+ file_handler = logging.FileHandler("OCRFlux-debug.log", mode="a")
44
+ file_handler.setLevel(logging.DEBUG)
45
+ file_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
46
+
47
+ console_handler = logging.StreamHandler()
48
+ console_handler.setLevel(logging.INFO)
49
+ console_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
50
+
51
+ # Add handlers to the logger
52
+ logger.addHandler(file_handler)
53
+ logger.addHandler(console_handler)
54
+ vllm_logger.addHandler(file_handler)
55
+
56
+ # Quiet logs from pypdf
57
+ logging.getLogger("pypdf").setLevel(logging.ERROR)
58
+
59
+ # Global variables for token statistics
60
+ metrics = MetricsKeeper(window=60 * 5)
61
+ tracker = WorkerTracker()
62
+
63
+ def build_page_to_markdown_query(args, pdf_path: str, page_number: int, target_longest_image_dim: int, image_rotation: int = 0) -> dict:
64
+ assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query"
65
+
66
+ image = get_page_image(pdf_path, page_number, target_longest_image_dim=target_longest_image_dim, image_rotation=image_rotation)
67
+ buffered = BytesIO()
68
+ image.save(buffered, format="PNG")
69
+ image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
70
+
71
+ return {
72
+ "model": args.model,
73
+ "messages": [
74
+ {
75
+ "role": "user",
76
+ "content": [
77
+ {"type": "text", "text": build_page_to_markdown_prompt()},
78
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
79
+ ],
80
+ }
81
+ ],
82
+ "temperature": 0.0,
83
+ }
84
+
85
+ def build_element_merge_detect_query(args,text_list_1,text_list_2) -> dict:
86
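+ # Text-only task: attach a tiny 28x28 black placeholder image to satisfy the multimodal chat request format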
+ image = Image.new('RGB', (28, 28), color='black')
87
+
88
+ buffered = BytesIO()
89
+ image.save(buffered, format="PNG")
90
+
91
+ image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
92
+
93
+ return {
94
+ "model": args.model,
95
+ "messages": [
96
+ {
97
+ "role": "user",
98
+ "content": [
99
+ {"type": "text", "text": build_element_merge_detect_prompt(text_list_1,text_list_2)},
100
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
101
+ ],
102
+ }
103
+ ],
104
+ "temperature": 0.0,
105
+ }
106
+
107
+ def build_html_table_merge_query(args,text_1,text_2) -> dict:
108
+ image = Image.new('RGB', (28, 28), color='black')
109
+
110
+ buffered = BytesIO()
111
+ image.save(buffered, format="PNG")
112
+
113
+ image_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
114
+
115
+ return {
116
+ "model": args.model,
117
+ "messages": [
118
+ {
119
+ "role": "user",
120
+ "content": [
121
+ {"type": "text", "text": build_html_table_merge_prompt(text_1,text_2)},
122
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
123
+ ],
124
+ }
125
+ ],
126
+ "temperature": 0.0,
127
+ }
128
+
129
+ # Manual simple implementation of HTTP Post
130
+ # It feels strange perhaps, but httpx and aiohttp are very complex beasts
131
+ # Ex. the sessionpool in httpcore has 4 different locks in it, and I've noticed
132
+ # that at the scale of 100M+ requests, they deadlock in different strange ways
133
+ async def apost(url, json_data):
134
+ parsed_url = urlparse(url)
135
+ host = parsed_url.hostname
136
+ port = parsed_url.port or 80
137
+ path = parsed_url.path or "/"
138
+
139
+ writer = None
140
+ try:
141
+ reader, writer = await asyncio.open_connection(host, port)
142
+
143
+ json_payload = json.dumps(json_data)
144
+ request = (
145
+ f"POST {path} HTTP/1.1\r\n"
146
+ f"Host: {host}\r\n"
147
+ f"Content-Type: application/json\r\n"
148
+ f"Content-Length: {len(json_payload)}\r\n"
149
+ f"Connection: close\r\n\r\n"
150
+ f"{json_payload}"
151
+ )
152
+ writer.write(request.encode())
153
+ await writer.drain()
154
+
155
+ # Read status line
156
+ status_line = await reader.readline()
157
+ if not status_line:
158
+ raise ConnectionError("No response from server")
159
+ status_parts = status_line.decode().strip().split(" ", 2)
160
+ if len(status_parts) < 2:
161
+ raise ValueError(f"Malformed status line: {status_line.decode().strip()}")
162
+ status_code = int(status_parts[1])
163
+
164
+ # Read headers
165
+ headers = {}
166
+ while True:
167
+ line = await reader.readline()
168
+ if line in (b"\r\n", b"\n", b""):
169
+ break
170
+ key, _, value = line.decode().partition(":")
171
+ headers[key.strip().lower()] = value.strip()
172
+
173
+ # Read response body
174
+ if "content-length" in headers:
175
+ body_length = int(headers["content-length"])
176
+ response_body = await reader.readexactly(body_length)
177
+ else:
178
+ raise ConnectionError("Anything other than fixed content length responses are not implemented yet")
179
+
180
+ return status_code, response_body
181
+ except Exception as e:
182
+ # Pass through errors
183
+ raise e
184
+ finally:
185
+ # But just make sure to close the socket on your way out
186
+ if writer is not None:
187
+ try:
188
+ writer.close()
189
+ await writer.wait_closed()
190
+ except:
191
+ pass
192
+
193
+ async def process_task(args, worker_id, task_name, task_args):
194
+ COMPLETION_URL = f"http://localhost:{args.port}/v1/chat/completions"
195
+ MAX_RETRIES = args.max_page_retries
196
+ TEMPERATURE_BY_ATTEMPT = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
197
+ exponential_backoffs = 0
198
+ local_image_rotation = 0
199
+ attempt = 0
200
+ await tracker.track_work(worker_id, f"{worker_id}", "started")
201
+ while attempt < MAX_RETRIES:
202
+ if task_name == 'page_to_markdown':
203
+ pdf_path,page_number = task_args
204
+ query = build_page_to_markdown_query(args, pdf_path, page_number, args.target_longest_image_dim, image_rotation=local_image_rotation)
205
+ elif task_name == 'element_merge_detect':
206
+ text_list_1,text_list_2 = task_args
207
+ query = build_element_merge_detect_query(args, text_list_1, text_list_2)
208
+ elif task_name == 'html_table_merge':
209
+ table_1,table_2 = task_args
210
+ query = build_html_table_merge_query(args, table_1, table_2)
211
+ query["temperature"] = TEMPERATURE_BY_ATTEMPT[
212
+ min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1)
213
+ ] # Change temperature as number of attempts increases to overcome repetition issues at expense of quality
214
+
215
+ try:
216
+ status_code, response_body = await apost(COMPLETION_URL, json_data=query)
217
+
218
+ if status_code == 400:
219
+ raise ValueError(f"Got BadRequestError from server: {response_body}, skipping this response")
220
+ elif status_code == 500:
221
+ raise ValueError(f"Got InternalServerError from server: {response_body}, skipping this response")
222
+ elif status_code != 200:
223
+ raise ValueError(f"Error http status {status_code}")
224
+
225
+ base_response_data = json.loads(response_body)
226
+
227
+ metrics.add_metrics(
228
+ vllm_input_tokens=base_response_data["usage"].get("prompt_tokens", 0),
229
+ vllm_output_tokens=base_response_data["usage"].get("completion_tokens", 0),
230
+ )
231
+
232
+ response_content = base_response_data["choices"][0]["message"]["content"]
233
+ if task_name == 'page_to_markdown':
234
+ model_response_json = json.loads(response_content)
235
+ page_response = PageResponse(**model_response_json)
236
+ if not page_response.is_rotation_valid and attempt < MAX_RETRIES - 1:
237
+ local_image_rotation = page_response.rotation_correction
238
+ raise ValueError("invalid page rotation")
239
+ try:
240
+ return_data = trans_markdown_text(page_response.natural_text,"matrix2html")
241
+ except:
242
+ if attempt < MAX_RETRIES - 1:
243
+ raise
244
+ else:
245
+ return_data = page_response.natural_text.replace("<t>","").replace("<l>","").replace("<lt>","")
246
+
247
+ elif task_name == 'element_merge_detect':
248
+ pattern = r"\((\d+), (\d+)\)"
249
+ matches = re.findall(pattern, response_content)
250
+ return_data = [(int(x), int(y)) for x, y in matches]
251
+ elif task_name == 'html_table_merge':
252
+ if not (response_content.startswith("<table>") and response_content.endswith("</table>")):
253
+ raise ValueError("Response is not a table")
254
+ return_data = response_content
255
+ else:
256
+ raise ValueError(f"Unknown task_name {task_name}")
257
+
258
+ await tracker.track_work(worker_id, f"{worker_id}", "finished")
259
+ return return_data
260
+
261
+ except (ConnectionError, OSError, asyncio.TimeoutError) as e:
262
+ logger.warning(f"Client error on attempt {attempt} for {worker_id}: {type(e)} {e}")
263
+
264
+ # Now we want to do exponential backoff, and not count this as an actual page retry
265
+ # Page retries are supposed to be for fixing bad results from the model, but actual requests to vllm
266
+ # are supposed to work. Probably this means that the server is just restarting
267
+ sleep_delay = 10 * (2**exponential_backoffs)
268
+ exponential_backoffs += 1
269
+ logger.info(f"Sleeping for {sleep_delay} seconds on {worker_id} to allow server restart")
270
+ await asyncio.sleep(sleep_delay)
271
+ except asyncio.CancelledError:
272
+ logger.info(f"Process {worker_id} cancelled")
273
+ await tracker.track_work(worker_id, f"{worker_id}", "cancelled")
274
+ raise
275
+ except json.JSONDecodeError as e:
276
+ logger.warning(f"JSON decode error on attempt {attempt} for {worker_id}: {e}")
277
+ attempt += 1
278
+ except ValueError as e:
279
+ logger.warning(f"ValueError on attempt {attempt} for {worker_id}: {type(e)} - {e}")
280
+ attempt += 1
281
+ except Exception as e:
282
+ logger.exception(f"Unexpected error on attempt {attempt} for {worker_id}: {type(e)} - {e}")
283
+ attempt += 1
284
+
285
+ logger.error(f"Failed to process {worker_id} after {MAX_RETRIES} attempts.")
286
+ await tracker.track_work(worker_id, f"{worker_id}", "errored")
287
+
288
+ return None
289
+
290
+ def postprocess_markdown_text(args, response_text, pdf_path, page_number):
291
+ text_list = response_text.split("\n\n")
292
+ new_text_list = []
293
+ for text in text_list:
294
+ if text.startswith("<Image>") and text.endswith("</Image>"):
295
+ pass
296
+ else:
297
+ new_text_list.append(text)
298
+ return "\n\n".join(new_text_list)
299
+
300
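+ # Assemble the final document text: apply cross-page table merges and element merges (working backward from the last page), then join the remaining non-empty elements in page order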
+ def bulid_document_text(page_to_markdown_result, element_merge_detect_result, html_table_merge_result):
301
+ page_to_markdown_keys = list(page_to_markdown_result.keys())
302
+ element_merge_detect_keys = list(element_merge_detect_result.keys())
303
+ html_table_merge_keys = list(html_table_merge_result.keys())
304
+
305
+ for page_1,page_2,elem_idx_1,elem_idx_2 in sorted(html_table_merge_keys,key=lambda x: -x[0]):
306
+ page_to_markdown_result[page_1][elem_idx_1] = html_table_merge_result[(page_1,page_2,elem_idx_1,elem_idx_2)]
307
+ page_to_markdown_result[page_2][elem_idx_2] = ''
308
+
309
+ for page_1,page_2 in sorted(element_merge_detect_keys,key=lambda x: -x[0]):
310
+ for elem_idx_1,elem_idx_2 in element_merge_detect_result[(page_1,page_2)]:
311
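+ # Join without a space when the first fragment is empty, ends with a hyphen, or ends with a CJK character; otherwise join with a single space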
+ if len(page_to_markdown_result[page_1][elem_idx_1]) == 0 or page_to_markdown_result[page_1][elem_idx_1][-1] == '-' or ('\u4e00' <= page_to_markdown_result[page_1][elem_idx_1][-1] <= '\u9fff'):
312
+ page_to_markdown_result[page_1][elem_idx_1] = page_to_markdown_result[page_1][elem_idx_1] + '' + page_to_markdown_result[page_2][elem_idx_2]
313
+ else:
314
+ page_to_markdown_result[page_1][elem_idx_1] = page_to_markdown_result[page_1][elem_idx_1] + ' ' + page_to_markdown_result[page_2][elem_idx_2]
315
+ page_to_markdown_result[page_2][elem_idx_2] = ''
316
+
317
+ document_text_list = []
318
+ for page in page_to_markdown_keys:
319
+ page_text_list = [s for s in page_to_markdown_result[page] if s]
320
+ document_text_list += page_text_list
321
+ return "\n\n".join(document_text_list)
322
+
323
+ async def process_pdf(args, worker_id: int, pdf_path: str):
324
+ logger.info(f"Start process_pdf for {pdf_path}")
325
+ if pdf_path.lower().endswith(".pdf"):
326
+ try:
327
+ reader = PdfReader(pdf_path)
328
+ num_pages = reader.get_num_pages()
329
+ except:
330
+ logger.exception(f"Could not count number of pages for {pdf_path}, aborting document")
331
+ return None
332
+ else:
333
+ num_pages = 1
334
+
335
+ logger.info(f"Got {num_pages} pages to do for {pdf_path} in worker {worker_id}")
336
+
337
+ try:
338
+ tasks = []
339
+ results = []
340
+ async with asyncio.TaskGroup() as tg:
341
+ for page_num in range(1, num_pages + 1):
342
+ task = tg.create_task(process_task(args, worker_id, task_name='page_to_markdown', task_args=(pdf_path,page_num)))
343
+ tasks.append(task)
344
+
345
+ results = [task.result() for task in tasks]
346
+
347
+ fallback_pages = []
348
+ page_to_markdown_result = {}
349
+ page_pairs = []
350
+ for i,result in enumerate(results):
351
+ if result != None:
352
+ page_number = i+1
353
+ page_to_markdown_result[i+1] = postprocess_markdown_text(args,result,pdf_path,page_number).split("\n\n")
354
+ if page_number-1 in page_to_markdown_result.keys():
355
+ page_pairs.append((page_number-1,page_number))
356
+ else:
357
+ fallback_pages.append(i)
358
+
359
+ num_fallback_pages = len(fallback_pages)
360
+
361
+ if num_fallback_pages / num_pages > args.max_page_error_rate:
362
+ logger.error(
363
+ f"Document {pdf_path} has {num_fallback_pages} fallback pages out of {num_pages} exceeding max_page_error_rate of {args.max_page_error_rate}, discarding document."
364
+ )
365
+ return None
366
+ elif num_fallback_pages > 0:
367
+ logger.warning(
368
+ f"Document {pdf_path} processed with {num_fallback_pages} fallback pages out of {num_pages}."
369
+ )
370
+
371
+ if args.skip_cross_page_merge:
372
+ page_texts = {}
373
+ document_text_list = []
374
+ sorted_page_keys = sorted(list(page_to_markdown_result.keys()))
375
+ for page_number in sorted_page_keys:
376
+ page_texts[str(page_number-1)] = "\n\n".join(page_to_markdown_result[page_number])
377
+ document_text_list.append(page_texts[str(page_number-1)])
378
+ document_text = "\n\n".join(document_text_list)
379
+ return {
380
+ "orig_path": pdf_path,
381
+ "num_pages": num_pages,
382
+ "document_text": document_text,
383
+ "page_texts": page_texts,
384
+ "fallback_pages": fallback_pages,
385
+ }
386
+
387
+ tasks = []
388
+ results = []
389
+ async with asyncio.TaskGroup() as tg:
390
+ for page_1,page_2 in page_pairs:
391
+ task = tg.create_task(process_task(args, worker_id, task_name='element_merge_detect', task_args=(page_to_markdown_result[page_1], page_to_markdown_result[page_2])))
392
+ tasks.append(task)
393
+ results = [task.result() for task in tasks]
394
+
395
+ element_merge_detect_result = {}
396
+ table_pairs = []
397
+ for page_pair,result in zip(page_pairs,results):
398
+ if result != None:
399
+ page_1,page_2 = page_pair
400
+ element_merge_detect_result[(page_1,page_2)] = result
401
+ for elem_idx_1,elem_idx_2 in result:
402
+ text_1 = page_to_markdown_result[page_1][elem_idx_1]
403
+ text_2 = page_to_markdown_result[page_2][elem_idx_2]
404
+ if text_1.startswith("<table>") and text_1.endswith("</table>") and text_2.startswith("<table>") and text_2.endswith("</table>"):
405
+ table_pairs.append((page_1,page_2,elem_idx_1,elem_idx_2))
406
+
407
+ tmp_page_to_markdown_result = copy.deepcopy(page_to_markdown_result)
408
+ table_pairs = sorted(table_pairs,key=lambda x: -x[0])
409
+ html_table_merge_result = {}
410
+ i = 0
411
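+ # Merge tables in batches; a batch ends before a pair whose second table is already the target of a scheduled merge, so tables spanning 3+ pages are merged across successive batches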
+ while i < len(table_pairs):
412
+ async with asyncio.TaskGroup() as tg:
413
+ tasks = []
414
+ ids_1 = []
415
+ ids_2 = []
416
+ page_1,page_2,elem_idx_1,elem_idx_2 = table_pairs[i]
417
+ task = tg.create_task(process_task(args, worker_id, task_name='html_table_merge', task_args=(tmp_page_to_markdown_result[page_1][elem_idx_1], tmp_page_to_markdown_result[page_2][elem_idx_2])))
418
+ tasks.append(task)
419
+ ids_1.append((page_1,elem_idx_1))
420
+ ids_2.append((page_2,elem_idx_2))
421
+ j = i + 1
422
+ while j < len(table_pairs):
423
+ page_1,page_2,elem_idx_1,elem_idx_2 = table_pairs[j]
424
+ if (page_2, elem_idx_2) not in ids_1:
425
+ task = tg.create_task(process_task(args, worker_id, task_name='html_table_merge', task_args=(tmp_page_to_markdown_result[page_1][elem_idx_1], tmp_page_to_markdown_result[page_2][elem_idx_2])))
426
+ tasks.append(task)
427
+ ids_1.append((page_1,elem_idx_1))
428
+ ids_2.append((page_2,elem_idx_2))
429
+ j = j + 1
430
+ else:
431
+ break
432
+
433
+ results = [task.result() for task in tasks]
434
+
435
+ for k,result in enumerate(results):
436
+ page_1,elem_idx_1 = ids_1[k]
437
+ page_2,elem_idx_2 = ids_2[k]
438
+ if result != None:
439
+ html_table_merge_result[(page_1,page_2,elem_idx_1,elem_idx_2)] = result
440
+ tmp_page_to_markdown_result[page_1][elem_idx_1] = html_table_merge_result[(page_1,page_2,elem_idx_1,elem_idx_2)]
441
+ i = j
442
+
443
+ page_texts = {}
444
+ for page_number in page_to_markdown_result.keys():
445
+ page_texts[str(page_number-1)] = "\n\n".join(page_to_markdown_result[page_number])
446
+
447
+ document_text = bulid_document_text(page_to_markdown_result, element_merge_detect_result, html_table_merge_result)
448
+
449
+ return {
450
+ "orig_path": pdf_path,
451
+ "num_pages": num_pages,
452
+ "document_text": document_text,
453
+ "page_texts": page_texts,
454
+ "fallback_pages": fallback_pages,
455
+ }
456
+ except Exception as e:
457
+ # Check for ExceptionGroup with BrokenProcessPool
458
+ if isinstance(e, ExceptionGroup):
459
+ broken_pool, other = e.split(BrokenProcessPool)
460
+ if broken_pool is not None: # Found at least one BrokenProcessPool
461
+ logger.critical("Encountered BrokenProcessPool, exiting process.")
462
+ sys.exit(1)
463
+
464
+ logger.exception(f"Exception in process_pdf for {pdf_path}: {e}")
465
+ return None
466
+
467
+ async def process_json(args, worker_id: int, json_path: str):
468
+ try:
469
+ json_data = json.load(open(json_path,'r'))
470
+ except:
471
+ logger.exception(f"Could not load {json_path}, aborting document")
+ return None
472
+ try:
473
+ if args.task == 'merge_pages':
474
+ page_1 = json_data['page_1'].split("\n\n")
475
+ page_2 = json_data['page_2'].split("\n\n")
476
+ async with asyncio.TaskGroup() as tg:
477
+ task = tg.create_task(process_task(args, worker_id, task_name='element_merge_detect', task_args=(page_1, page_2)))
478
+ result = task.result()
479
+ return {
480
+ "orig_path": json_path,
481
+ "merge_pairs": result
482
+ }
483
+ elif args.task == 'merge_tables':
484
+ table_1 = json_data['table_1']
485
+ table_2 = json_data['table_2']
486
+ async with asyncio.TaskGroup() as tg:
487
+ task = tg.create_task(process_task(args, worker_id, task_name='html_table_merge', task_args=(table_1, table_2)))
488
+ result = task.result()
489
+ return {
490
+ "orig_path": json_path,
491
+ "merged_tables": result
492
+ }
493
+ else:
494
+ raise ValueError(f"Unknown task {args.task}")
495
+
496
+ except Exception as e:
497
+ # Check for ExceptionGroup with BrokenProcessPool
498
+ if isinstance(e, ExceptionGroup):
499
+ broken_pool, other = e.split(BrokenProcessPool)
500
+ if broken_pool is not None: # Found at least one BrokenProcessPool
501
+ logger.critical("Encountered BrokenProcessPool, exiting process.")
502
+ sys.exit(1)
503
+
504
+ logger.exception(f"Exception in process_json for {json_path}: {e}")
505
+ return None
506
+
507
+ async def worker(args, work_queue: WorkQueue, semaphore, worker_id):
508
+ while True:
509
+ # Wait until allowed to proceed
510
+ await semaphore.acquire()
511
+
512
+ work_item = await work_queue.get_work()
513
+
514
+ if work_item is None:
515
+ logger.info(f"Worker {worker_id} exiting due to empty queue")
516
+ semaphore.release()
517
+ break
518
+
519
+ logger.info(f"Worker {worker_id} processing work item {work_item.hash}")
520
+ await tracker.clear_work(worker_id)
521
+
522
+ try:
523
+ async with asyncio.TaskGroup() as tg:
524
+ if args.task == 'pdf2markdown':
525
+ tasks = [tg.create_task(process_pdf(args, worker_id, pdf_path)) for pdf_path in work_item.work_paths]
526
+ elif args.task == 'merge_pages' or args.task == 'merge_tables':
527
+ tasks = [tg.create_task(process_json(args, worker_id, json_path)) for json_path in work_item.work_paths]
528
+ else:
529
+ raise ValueError(f"Unknown task {args.task}")
530
+
531
+ logger.info(f"Created all tasks for {work_item.hash}")
532
+
533
+ logger.info(f"Finished TaskGroup for worker on {work_item.hash}")
534
+
535
+ results = []
536
+ for task in tasks:
537
+ try:
538
+ result = task.result()
539
+ except:
540
+ result = None  # a failed task must not reuse the previous task's result
541
+
542
+ if result is not None:
543
+ results.append(result)
544
+
545
+ logger.info(f"Got {len(results)} docs for {work_item.hash}")
546
+
547
+ output_final_path = os.path.join(args.workspace, "results", f"output_{work_item.hash}.jsonl")
548
+ with open(output_final_path, "w") as f:
549
+ for result in results:
550
+ f.write(json.dumps(result))
551
+ f.write("\n")
552
+
553
+ await work_queue.mark_done(work_item)
554
+ except Exception as e:
555
+ logger.exception(f"Exception occurred while processing work_hash {work_item.hash}: {e}")
556
+ finally:
557
+ semaphore.release()
558
+
559
+ async def vllm_server_task(args, semaphore):
560
+ model_name_or_path = args.model
561
+
562
+ cmd = [
563
+ "vllm",
564
+ "serve",
565
+ model_name_or_path,
566
+ "--port",
567
+ str(args.port),
568
+ "--max-model-len",
569
+ str(args.model_max_context),
570
+ "--gpu_memory_utilization",
571
+ str(0.8)
572
+ ]
573
+
574
+ proc = await asyncio.create_subprocess_exec(
575
+ *cmd,
576
+ stdout=asyncio.subprocess.PIPE,
577
+ stderr=asyncio.subprocess.PIPE,
578
+ )
579
+
580
+ # Ensure the subprocess is terminated on exit
581
+ def _kill_proc():
582
+ proc.terminate()
583
+
584
+ atexit.register(_kill_proc)
585
+
586
+ # Shared variables between tasks
587
+ last_running_req, last_queue_req = 0, 0
588
+ server_printed_ready_message = False
589
+ last_semaphore_release = time.time()
590
+
591
+ async def process_line(line):
592
+ nonlocal last_running_req, last_queue_req, last_semaphore_release, server_printed_ready_message
593
+ vllm_logger.info(line)
594
+
595
+ # if the server hasn't initialized yet, log all the lines to the main logger also, so that the user
596
+ # can see any warnings/errors more easily
597
+ if not server_printed_ready_message:
598
+ logger.info(line)
599
+
600
+ if "Detected errors during sampling" in line:
601
+ logger.error("Cannot continue, sampling errors detected, model is probably corrupt")
602
+ sys.exit(1)
603
+
604
+ # TODO, need to trace down this issue in vllm itself, but it will otherwise cause the server to lock up
605
+ if "IndexError: list index out of range" in line:
606
+ logger.error("IndexError in model, restarting server")
607
+ proc.terminate()
608
+
609
+ if not server_printed_ready_message and "The server is fired up and ready to roll!" in line:
610
+ server_printed_ready_message = True
611
+ last_semaphore_release = time.time()
612
+
613
+ match = re.search(r"Running: (\d+)", line)
614
+ if match:
615
+ last_running_req = int(match.group(1))
616
+
617
+ match = re.search(r"(?:Waiting|Pending):\s*(\d+)", line)
618
+ if match:
619
+ last_queue_req = int(match.group(1))
620
+ logger.info(f"vllm running req: {last_running_req} queue req: {last_queue_req}")
621
+
622
+ async def read_stream(stream):
623
+ while True:
624
+ line = await stream.readline()
625
+ if not line:
626
+ break
627
+ try:
628
+ line = line.decode("utf-8").rstrip()
629
+ await process_line(line)
630
+ except Exception as ex:
631
+ logger.warning(f"Got {ex} when reading log line from inference server, skipping")
632
+
633
+ async def timeout_task():
634
+ nonlocal last_running_req, last_queue_req, last_semaphore_release
635
+ try:
636
+ while True:
637
+ await asyncio.sleep(1)
638
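+ # Release the semaphore when the server is ready, its request queue is empty, and at least 30 seconds have passed since the last release, letting another worker start submitting work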
+ if server_printed_ready_message and last_queue_req == 0 and time.time() - last_semaphore_release > 30 and semaphore.locked():
639
+ semaphore.release()
640
+ last_semaphore_release = time.time()
641
+ logger.info("Semaphore released, allowing a worker to proceed.")
642
+ except asyncio.CancelledError:
643
+ pass # Clean up if the task is cancelled
644
+
645
+ # Start tasks to read stdout, stderr, and handle timeout logic
646
+ stdout_task = asyncio.create_task(read_stream(proc.stdout))
647
+ stderr_task = asyncio.create_task(read_stream(proc.stderr))
648
+ timeout_task = asyncio.create_task(timeout_task())
649
+
650
+ try:
651
+ await proc.wait()
652
+ except asyncio.CancelledError:
653
+ logger.info("Got cancellation request for VLLM server")
654
+ proc.terminate()
655
+ raise
656
+
657
+ timeout_task.cancel()
658
+ await asyncio.gather(stdout_task, stderr_task, timeout_task, return_exceptions=True)
659
+
660
+ async def vllm_server_host(args, semaphore):
661
+ MAX_RETRIES = 5
662
+ retry = 0
663
+
664
+ while retry < MAX_RETRIES:
665
+ await vllm_server_task(args, semaphore)
666
+ logger.warning("VLLM server task ended")
667
+ retry += 1
668
+
669
+ if retry >= MAX_RETRIES:
670
+ logger.error(f"Ended up starting the vllm server more than {retry} times, cancelling pipeline")
671
+ logger.error("")
672
+ logger.error("Please make sure vllm is installed according to the latest instructions here: https://docs.vllm.ai/start/install.html")
673
+ sys.exit(1)
674
+
675
+ async def vllm_server_ready(args):
676
+ max_attempts = 300
677
+ delay_sec = 1
678
+ url = f"http://localhost:{args.port}/v1/models"
679
+
680
+ for attempt in range(1, max_attempts + 1):
681
+ try:
682
+ async with httpx.AsyncClient() as session:
683
+ response = await session.get(url)
684
+
685
+ if response.status_code == 200:
686
+ logger.info("vllm server is ready.")
687
+ return
688
+ else:
689
+ logger.info(f"Attempt {attempt}: Unexpected status code {response.status_code}")
690
+ except Exception:
691
+ logger.warning(f"Attempt {attempt}: Please wait for vllm server to become ready...")
692
+
693
+ await asyncio.sleep(delay_sec)
694
+
695
+ raise Exception("vllm server did not become ready after waiting.")
696
+
697
+ async def download_model(model_name_or_path: str):
698
+ if os.path.isabs(model_name_or_path) and os.path.isdir(model_name_or_path):
699
+ logger.info(f"Using local model path at '{model_name_or_path}'")
700
+ else:
701
+ logger.info(f"Downloading model with hugging face '{model_name_or_path}'")
702
+ snapshot_download(repo_id=model_name_or_path)
703
+
704
+ async def metrics_reporter(work_queue):
705
+ while True:
706
+ # Leading newlines preserve table formatting in logs
707
+ logger.info(f"Queue remaining: {work_queue.size}")
708
+ logger.info("\n" + str(metrics))
709
+ logger.info("\n" + str(await tracker.get_status_table()))
710
+ await asyncio.sleep(10)
711
+
712
+ async def main():
713
+ parser = argparse.ArgumentParser(description="Manager for running millions of PDFs through a batch inference pipeline")
714
+ parser.add_argument(
715
+ "workspace",
716
+ help="The filesystem path where work will be stored, can be a local folder",
717
+ )
718
+
719
+ parser.add_argument("--task", type=str, choices=['pdf2markdown','merge_pages','merge_tables'], default='pdf2markdown', help="task names, could be 'pdf2markdown', 'merge_pages' or 'merge_tables'")
720
+
721
+ parser.add_argument(
722
+ "--data",
723
+ nargs="*",
724
+ help="List of paths to files to process",
725
+ default=None,
726
+ )
727
+
728
+ parser.add_argument("--pages_per_group", type=int, default=500, help="Aiming for this many pdf pages per work item group")
729
+ parser.add_argument("--max_page_retries", type=int, default=8, help="Max number of times we will retry rendering a page")
730
+ parser.add_argument("--max_page_error_rate", type=float, default=0.004, help="Rate of allowable failed pages in a document, 1/250 by default")
731
+ parser.add_argument("--workers", type=int, default=8, help="Number of workers to run at a time")
732
+
733
+ # Model parameters
734
+ parser.add_argument(
735
+ "--model",
736
+ help="The path to the model",
737
+ default="ChatDOC/OCRFlux-3B",
738
+ )
739
+ parser.add_argument("--model_max_context", type=int, default=16384, help="Maximum context length that the model was fine tuned under")
740
+ parser.add_argument("--model_chat_template", type=str, default="qwen2-vl", help="Chat template to pass to vllm server")
741
+ parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1024)
742
+
743
+ parser.add_argument("--skip_cross_page_merge", action="store_true", help="Whether to skip cross-page merging")
744
+
745
+ parser.add_argument("--port", type=int, default=40078, help="Port to use for the VLLM server")
746
+ args = parser.parse_args()
747
+
748
+ if os.path.exists(args.workspace):
749
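+ # Start from a clean workspace: any queue state and results from previous runs are removed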
+ shutil.rmtree(args.workspace)
750
+
751
+ # We need poppler to load the initial pdfs, even if we are not processing them here
752
+ check_poppler_version()
753
+
754
+ work_queue = LocalWorkQueue(args.workspace)
755
+
756
+ if args.task == 'pdf2markdown':
757
+ pdf_work_paths = set()
758
+
759
+ for pdf_path in args.data:
760
+ if os.path.exists(pdf_path):
761
+ if pdf_path.lower().endswith(".pdf") and open(pdf_path, "rb").read(4) == b"%PDF":
762
+ logger.info(f"Loading file at {pdf_path} as PDF document")
763
+ pdf_work_paths.add(pdf_path)
764
+ elif is_image(pdf_path):
765
+ logger.info(f"Loading file at {pdf_path} as image document")
766
+ pdf_work_paths.add(pdf_path)
767
+ else:
768
+ raise ValueError(f"Unsupported file extension for {pdf_path}")
769
+ else:
770
+ raise ValueError(f"{pdf_path} does not exist")
771
+
772
+ logger.info(f"Found {len(pdf_work_paths):,} total pdf paths to add")
773
+
774
+ # Estimate average pages per pdf
775
+ sample_size = min(100, len(pdf_work_paths))
776
+ sampled_pdfs = random.sample(list(pdf_work_paths), sample_size)
777
+ page_counts = []
778
+
779
+ for pdf_path in tqdm(sampled_pdfs, desc="Sampling PDFs to calculate optimal length"):
780
+ try:
781
+ if pdf_path.lower().endswith(".pdf"):
782
+ reader = PdfReader(pdf_path)
783
+ page_counts.append(len(reader.pages))
784
+ else:
785
+ page_counts.append(1)
786
+ except Exception as e:
787
+ logger.warning(f"Failed to read {pdf_path}: {e}")
788
+
789
+ if page_counts:
790
+ avg_pages_per_pdf = sum(page_counts) / len(page_counts)
791
+ else:
792
+ logger.warning("Could not read any PDFs to estimate average page count.")
793
+ avg_pages_per_pdf = 10 # Default to 10 pages per PDF if sampling fails
794
+
795
+ items_per_group = max(1, int(args.pages_per_group / avg_pages_per_pdf))
796
+ logger.info(f"Calculated items_per_group: {items_per_group} based on average pages per PDF: {avg_pages_per_pdf:.2f}")
797
+
798
+ # Now call populate_queue
799
+ await work_queue.populate_queue(pdf_work_paths, items_per_group)
800
+ elif args.task == 'merge_pages' or args.task == 'merge_tables':
801
+ json_work_paths = set()
802
+ for json_path in args.data:
803
+ if os.path.exists(json_path):
804
+ if json_path.lower().endswith(".json"):
805
+ json_work_paths.add(json_path)
806
+ elif json_path.lower().endswith(".txt"):
807
+ logger.info(f"Loading file at {json_path} as list of paths")
808
+ with open(json_path, "r") as f:
809
+ json_work_paths |= set(filter(None, (line.strip() for line in f)))
810
+ else:
811
+ raise ValueError(f"Unsupported file extension for {json_path}")
812
+ else:
813
+ raise ValueError(f"{json_path} does not exist")
814
+
815
+ # Now call populate_queue
816
+ await work_queue.populate_queue(json_work_paths, args.pages_per_group)
817
+
818
+
819
+ # If you get this far, then you are doing inference and need a GPU
820
+ check_vllm_version()
821
+ check_torch_gpu_available()
822
+
823
+ logger.info(f"Starting pipeline with PID {os.getpid()}")
824
+
825
+ # Download the model before you do anything else
826
+ await download_model(args.model)
827
+
828
+ # Initialize the work queue
829
+ qsize = await work_queue.initialize_queue()
830
+
831
+ if qsize == 0:
832
+ logger.info("No work to do, exiting")
833
+ return
834
+ # Create a semaphore to control worker access
835
+ # We only allow one worker to move forward with requests, until the server has no more requests in its queue
836
+ # This lets us get full utilization by having many workers, but also to be outputting dolma docs as soon as possible
837
+ # As soon as one worker is no longer saturating the gpu, the next one can start sending requests
838
+ semaphore = asyncio.Semaphore(1)
839
+
840
+ vllm_server = asyncio.create_task(vllm_server_host(args, semaphore))
841
+
842
+ await vllm_server_ready(args)
843
+
844
+ metrics_task = asyncio.create_task(metrics_reporter(work_queue))
845
+
846
+ # Create worker tasks to process the queue concurrently.
847
+ worker_tasks = []
848
+ for i in range(args.workers):
849
+ task = asyncio.create_task(worker(args, work_queue, semaphore, worker_id=i))
850
+ worker_tasks.append(task)
851
+
852
+ # Wait for all worker tasks to finish
853
+ await asyncio.gather(*worker_tasks)
854
+
855
+ vllm_server.cancel()
856
+ metrics_task.cancel()
857
+ logger.info("Work done")
858
+
859
+
860
+ if __name__ == "__main__":
861
+ asyncio.run(main())
ocrflux/prompts.py ADDED
@@ -0,0 +1,60 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
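+ # Structured result of the page_to_markdown task, parsed from the model's JSON response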
+ @dataclass(frozen=True)
6
+ class PageResponse:
7
+ primary_language: Optional[str]
8
+ is_rotation_valid: bool
9
+ rotation_correction: int
10
+ is_table: bool
11
+ is_diagram: bool
12
+ natural_text: Optional[str]
13
+
14
+ def __post_init__(self):
15
+ # Validate rotation_correction is one of the allowed values
16
+ if self.rotation_correction not in {0, 90, 180, 270}:
17
+ raise ValueError("rotation_correction must be one of [0, 90, 180, 270].")
18
+
19
+ # Type checks
20
+ if not isinstance(self.primary_language, (str, type(None))):
21
+ raise TypeError("primary_language must be of type Optional[str].")
22
+ if not isinstance(self.is_rotation_valid, bool):
23
+ raise TypeError("is_rotation_valid must be of type bool.")
24
+ if not isinstance(self.rotation_correction, int):
25
+ raise TypeError("rotation_correction must be of type int.")
26
+ if not isinstance(self.is_table, bool):
27
+ raise TypeError("is_table must be of type bool.")
28
+ if not isinstance(self.is_diagram, bool):
29
+ raise TypeError("is_diagram must be of type bool.")
30
+ if not isinstance(self.natural_text, (str, type(None))):
31
+ raise TypeError("natural_text must be of type Optional[str].")
32
+
33
+ def build_element_merge_detect_prompt(text_list_1,text_list_2) -> str:
34
+ task = '''Below are two consecutive pages in Markdown format, where each element of them is numbered. Identify pairs of elements which should be merged across the two pages, such as text paragraphs or tables that span across the two pages. Return pairs as [(element_index_of_page1, element_index_of_page2), ...] or [] if no elements should be merged.\n'''
35
+ task += "Previous page:\n"
36
+ for i,text in enumerate(text_list_1):
37
+ task += f"{i}. {text}\n\n"
38
+ task += "Next page:\n"
39
+ for i,text in enumerate(text_list_2):
40
+ task += f"{i}. {text}\n\n"
41
+ return task
42
+
43
+ def build_html_table_merge_prompt(table1,table2) -> str:
44
+ return (
45
+ f"Below are two tables in HTML format, merge them into one table in HTML format.\n"
46
+ f"TABLE 1:\n"
47
+ f"{table1}\n"
48
+ f"TABLE 2:\n"
49
+ f"{table2}\n"
50
+ )
51
+
52
+ def build_page_to_markdown_prompt() -> str:
53
+ return (
54
+ f"Below is the image of one page of a document. "
55
+ f"Just return the plain text representation of this document as if you were reading it naturally.\n"
56
+ f"ALL tables should be presented in HTML format.\n"
57
+ f"If there are images or figures in the page, present them as \"<Image>(left,top),(right,bottom)</Image>\", (left,top,right,bottom) are the coordinates of the top-left and bottom-right corners of the image or figure.\n"
58
+ f"Present all titles and headings as H1 headings.\n"
59
+ f"Do not hallucinate.\n"
60
+ )
ocrflux/table_format.py ADDED
@@ -0,0 +1,143 @@
1
+
2
+ from bs4 import BeautifulSoup
3
+ import re
4
+
5
+ def is_html_table(text):
6
+ soup = BeautifulSoup(text, "html.parser")
7
+ return soup.find('table') is not None
8
+
9
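+ # Convert the model's 'matrix' table format (one <td> per grid cell, with <l>/<t>/<lt> marking cells covered from the left/top/both) into standard HTML with rowspan/colspan attributes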
+ def table_matrix2html(matrix_table):
10
+ soup = BeautifulSoup(matrix_table, 'html.parser')
11
+ table = soup.find('table')
12
+ rownum = 0
13
+ colnum = 0
14
+ cell_dict = {}
15
+ rid = 0
16
+ for tr in table.find_all('tr'):
17
+ cid = 0
18
+ for td in tr.find_all('td'):
19
+ if td.find('l'):
20
+ cell_dict[(rid, cid)] = '<l>'
21
+ elif td.find('t'):
22
+ cell_dict[(rid, cid)] = '<t>'
23
+ elif td.find('lt'):
24
+ cell_dict[(rid, cid)] = '<lt>'
25
+ else:
26
+ text = td.get_text(strip=True)
27
+ cell_dict[(rid, cid)] = text
28
+ cid += 1
29
+ if colnum == 0:
30
+ colnum = cid
31
+ elif cid != colnum:
32
+ raise Exception('colnum not match')
33
+ rid += 1
34
+ rownum = rid
35
+ html_table = ['<table>']
36
+ for rid in range(rownum):
37
+ html_table.append('<tr>')
38
+ for cid in range(colnum):
39
+ if (rid, cid) not in cell_dict.keys():
40
+ continue
41
+ text = cell_dict[(rid, cid)]
42
+ if text == '<l>' or text == '<t>' or text == '<lt>':
43
+ raise Exception('cell not match')
44
+ rowspan = 1
45
+ colspan = 1
46
+ for r in range(rid+1, rownum):
47
+ if (r, cid) in cell_dict.keys() and cell_dict[(r, cid)] == '<t>':
48
+ rowspan += 1
49
+ del cell_dict[(r, cid)]
50
+ else:
51
+ break
52
+ for c in range(cid+1, colnum):
53
+ if (rid, c) in cell_dict.keys() and cell_dict[(rid, c)] == '<l>':
54
+ colspan += 1
55
+ del cell_dict[(rid, c)]
56
+ else:
57
+ break
58
+ for r in range(rid+1, rid+rowspan):
59
+ for c in range(cid+1, cid+colspan):
60
+ if cell_dict[(r, c)] != '<lt>':
61
+ raise Exception('cell not match')
62
+ del cell_dict[(r, c)]
63
+ attr = ''
64
+ if rowspan > 1:
65
+ attr += ' rowspan="{}"'.format(rowspan)
66
+ if colspan > 1:
67
+ attr += ' colspan="{}"'.format(colspan)
68
+ html_table.append("<td{}>{}</td>".format(attr, text))
69
+ html_table.append('</tr>')
70
+ html_table.append('</table>')
71
+ return "".join(html_table)
72
+
73
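+ # Inverse of table_matrix2html: expand rowspan/colspan cells into a full grid, marking covered cells with <l>, <t> or <lt>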
+ def table_html2matrix(html_table):
74
+ soup = BeautifulSoup(html_table, 'html.parser')
75
+ table = soup.find('table')
76
+ rownum = len(table.find_all('tr'))
77
+ colnum = 0
78
+ tr = table.find_all('tr')[0]
79
+ for td in tr.find_all('td'):
80
+ colnum += int(td.get('colspan', 1))  # attribute values come back from BeautifulSoup as strings
81
+ matrix = [[None for _ in range(colnum)] for _ in range(rownum)]
82
+
83
+ rid = 0
84
+ for tr in table.find_all('tr'):
85
+ cid = 0
86
+ for td in tr.find_all('td'):
87
+ for c in range(cid, colnum):
88
+ if matrix[rid][c] is None:
89
+ break
90
+ cid = c
91
+ rowspan = int(td.get('rowspan', 1))
92
+ colspan = int(td.get('colspan', 1))
93
+ cell_text = td.get_text(strip=True)
94
+ for r in range(rid,rid+rowspan):
95
+ if r >= rownum:
96
+ raise Exception('rownum not match')
97
+ for c in range(cid,cid+colspan):
98
+ if c >= colnum:
99
+ raise Exception('colnum not match')
100
+ if matrix[r][c] is not None:
101
+ raise Exception('cell not match')
102
+ if r == rid and c == cid:
103
+ matrix[r][c] = cell_text
104
+ elif r == rid:
105
+ matrix[r][c] = '<l>'
106
+ elif c == cid:
107
+ matrix[r][c] = '<t>'
108
+ else:
109
+ matrix[r][c] = '<lt>'
110
+ cid += colspan
111
+ rid += 1
112
+
113
+ matrix_table = ['<table>']
114
+ for rid in range(rownum):
115
+ matrix_table.append('<tr>')
116
+ for cid in range(colnum):
117
+ matrix_table.append('<td>')
118
+ cell_text = matrix[rid][cid]
119
+ matrix_table.append(cell_text)
120
+ matrix_table.append('</td>')
121
+ matrix_table.append('</tr>')
122
+ matrix_table.append('</table>')
123
+ return "".join(matrix_table)
124
+
125
+ trans_func = {
126
+ "html2matrix": table_html2matrix,
127
+ "matrix2html": table_matrix2html,
128
+ }
129
+
130
+ def trans_markdown_text(markdown_text,trans_type):
131
+ if markdown_text == None:
132
+ return None
133
+ text_list = markdown_text.split('\n\n')
134
+ for i,text in enumerate(text_list):
135
+ if is_html_table(text):
136
+ text_list[i] = trans_func[trans_type](text)
137
+ return "\n\n".join(text_list)
138
+
139
+
140
+
141
+
142
+
143
+
ocrflux/work_queue.py ADDED
@@ -0,0 +1,357 @@
1
+ import abc
2
+ import asyncio
3
+ import datetime
4
+ import hashlib
5
+ import logging
6
+ import os
7
+ import random
8
+ from asyncio import Queue
9
+ from dataclasses import dataclass
10
+ from typing import Any, List, Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ @dataclass
16
+ class WorkItem:
17
+ """Represents a single work item in the queue"""
18
+
19
+ hash: str
20
+ work_paths: List[str]
21
+
22
+
23
+ class WorkQueue(abc.ABC):
24
+ """
25
+ Base class defining the interface for a work queue.
26
+ """
27
+
28
+ @abc.abstractmethod
29
+ async def populate_queue(self, work_paths: List[str], items_per_group: int) -> None:
30
+ """
31
+ Add new items to the work queue. The specifics will vary depending on
32
+ whether this is a local or S3-backed queue.
33
+
34
+ Args:
35
+ work_paths: Each individual path that we will process over
36
+ items_per_group: Number of items to group together in a single work item
37
+ """
38
+ pass
39
+
40
+ @abc.abstractmethod
41
+ async def initialize_queue(self) -> int:
42
+ """
43
+ Load the work queue from the relevant store (local or remote)
44
+ and initialize it for processing.
45
+
46
+ For example, this might remove already completed work items and randomize
47
+ the order before adding them to an internal queue.
48
+ """
49
+ pass
50
+
51
+ @abc.abstractmethod
52
+ async def is_completed(self, work_hash: str) -> bool:
53
+ """
54
+ Check if a work item has been completed.
55
+
56
+ Args:
57
+ work_hash: Hash of the work item to check
58
+
59
+ Returns:
60
+ True if the work is completed, False otherwise
61
+ """
62
+ pass
63
+
64
+ @abc.abstractmethod
65
+ async def get_work(self, worker_lock_timeout_secs: int = 1800) -> Optional[WorkItem]:
66
+ """
67
+ Get the next available work item that isn't completed or locked.
68
+
69
+ Args:
70
+ worker_lock_timeout_secs: Number of seconds before considering
71
+ a worker lock stale (default 30 mins)
72
+
73
+ Returns:
74
+ WorkItem if work is available, None if queue is empty
75
+ """
76
+ pass
77
+
78
+ @abc.abstractmethod
79
+ async def mark_done(self, work_item: WorkItem) -> None:
80
+ """
81
+ Mark a work item as done by removing its lock file
82
+ or performing any other cleanup.
83
+
84
+ Args:
85
+ work_item: The WorkItem to mark as done
86
+ """
87
+ pass
88
+
89
+ @property
90
+ @abc.abstractmethod
91
+ def size(self) -> int:
92
+ """Get current size of work queue"""
93
+ pass
94
+
95
+ @staticmethod
96
+ def _compute_workgroup_hash(work_paths: List[str]) -> str:
97
+ """
98
+ Compute a deterministic hash for a group of paths.
99
+
100
+ Args:
101
+ work_paths: List of paths (local or S3)
102
+
103
+ Returns:
104
+ SHA1 hash of the sorted paths
105
+ """
106
+ sha1 = hashlib.sha1()
107
+ for path in sorted(work_paths):
108
+ sha1.update(path.encode("utf-8"))
109
+ return sha1.hexdigest()
110
+
111
+
112
+ # --------------------------------------------------------------------------------------
113
+ # Local Helpers for reading/writing the index CSV (compressed with zstd) to disk
114
+ # --------------------------------------------------------------------------------------
115
+
116
+ try:
117
+ import zstandard
118
+ except ImportError:
119
+ zstandard = None
120
+
121
+
122
+ def download_zstd_csv_local(local_path: str) -> List[str]:
123
+ """
124
+ Download a zstd-compressed CSV from a local path.
125
+ If the file doesn't exist, returns an empty list.
126
+ """
127
+ if not os.path.exists(local_path):
128
+ return []
129
+
130
+ if not zstandard:
131
+ raise RuntimeError("zstandard package is required for local zstd CSV operations.")
132
+
133
+ with open(local_path, "rb") as f:
134
+ dctx = zstandard.ZstdDecompressor()
135
+ data = dctx.decompress(f.read())
136
+ lines = data.decode("utf-8").splitlines()
137
+ return lines
138
+
139
+
140
+ def upload_zstd_csv_local(local_path: str, lines: List[str]) -> None:
141
+ """
142
+ Upload a zstd-compressed CSV to a local path.
143
+ """
144
+ if not zstandard:
145
+ raise RuntimeError("zstandard package is required for local zstd CSV operations.")
146
+
147
+ data = "\n".join(lines).encode("utf-8")
148
+ cctx = zstandard.ZstdCompressor()
149
+ compressed_data = cctx.compress(data)
150
+
151
+ # Ensure parent directories exist
152
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
153
+
154
+ with open(local_path, "wb") as f:
155
+ f.write(compressed_data)
156
+
157
+
+ # --------------------------------------------------------------------------------------
+ # LocalWorkQueue Implementation
+ # --------------------------------------------------------------------------------------
+
+
+ class LocalWorkQueue(WorkQueue):
+     """
+     A local in-memory and on-disk WorkQueue implementation, which uses
+     a local workspace directory to store the queue index, lock files,
+     and completed results for persistent resumption across process restarts.
+     """
+
+     def __init__(self, workspace_path: str):
+         """
+         Initialize the local work queue.
+
+         Args:
+             workspace_path: Local directory path where the queue index,
+                 results, and locks are stored.
+         """
+         self.workspace_path = os.path.abspath(workspace_path)
+         os.makedirs(self.workspace_path, exist_ok=True)
+
+         # Local index file (compressed)
+         self._index_path = os.path.join(self.workspace_path, "work_index_list.csv.zstd")
+
+         # Output directory for completed tasks
+         self._results_dir = os.path.join(self.workspace_path, "results")
+         os.makedirs(self._results_dir, exist_ok=True)
+
+         # Directory for lock files
+         self._locks_dir = os.path.join(self.workspace_path, "worker_locks")
+         os.makedirs(self._locks_dir, exist_ok=True)
+
+         # Internal queue
+         self._queue: Queue[Any] = Queue()
+
+     async def populate_queue(self, work_paths: List[str], items_per_group: int) -> None:
+         """
+         Add new items to the work queue (local version).
+
+         Args:
+             work_paths: Each individual (local, in this context) path
+                 that we will process
+             items_per_group: Number of items to group together in a single work item
+         """
+         # Treat them as local paths, but keep variable name for consistency
+         all_paths = set(work_paths)
+         logger.info(f"Found {len(all_paths):,} total paths")
+
+         # Load existing work groups from local index
+         existing_lines = await asyncio.to_thread(download_zstd_csv_local, self._index_path)
+         existing_groups = {}
+         for line in existing_lines:
+             if line.strip():
+                 parts = line.strip().split(",")
+                 group_hash = parts[0]
+                 group_paths = parts[1:]
+                 existing_groups[group_hash] = group_paths
+
+         existing_path_set = {p for paths in existing_groups.values() for p in paths}
+         new_paths = all_paths - existing_path_set
+         logger.info(f"{len(new_paths):,} new paths to add to the workspace")
+
+         if not new_paths:
+             return
+
+         # Create new work groups
+         new_groups = []
+         current_group = []
+         for path in sorted(new_paths):
+             current_group.append(path)
+             if len(current_group) == items_per_group:
+                 group_hash = self._compute_workgroup_hash(current_group)
+                 new_groups.append((group_hash, current_group))
+                 current_group = []
+         if current_group:
+             group_hash = self._compute_workgroup_hash(current_group)
+             new_groups.append((group_hash, current_group))
+
+         logger.info(f"Created {len(new_groups):,} new work groups")
+
+         # Combine and save updated work groups
+         combined_groups = existing_groups.copy()
+         for group_hash, group_paths in new_groups:
+             combined_groups[group_hash] = group_paths
+
+         combined_lines = [",".join([group_hash] + group_paths) for group_hash, group_paths in combined_groups.items()]
+
+         if new_groups:
+             # Write the combined data back to disk in zstd CSV format
+             await asyncio.to_thread(upload_zstd_csv_local, self._index_path, combined_lines)
+
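For illustration only, the grouping step in `populate_queue` reduces to the following standalone sketch. `_toy_group_hash` is a stand-in; the real `_compute_workgroup_hash` is defined on the `WorkQueue` base class and is not shown in this file:

```python
import hashlib
from typing import List, Tuple

def _toy_group_hash(paths: List[str]) -> str:
    # Stand-in only: hash the sorted, comma-joined paths
    return hashlib.sha1(",".join(sorted(paths)).encode("utf-8")).hexdigest()

def group_paths(paths: List[str], items_per_group: int) -> List[Tuple[str, List[str]]]:
    groups, current = [], []
    for path in sorted(paths):
        current.append(path)
        if len(current) == items_per_group:
            groups.append((_toy_group_hash(current), current))
            current = []
    if current:  # final, possibly smaller, group
        groups.append((_toy_group_hash(current), current))
    return groups

# group_paths(["a.pdf", "b.pdf", "c.pdf"], 2)
# -> [(<hash of "a.pdf,b.pdf">, ["a.pdf", "b.pdf"]), (<hash of "c.pdf">, ["c.pdf"])]
```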
+     async def initialize_queue(self) -> int:
+         """
+         Load the work queue from the local index file and initialize it for processing.
+         Removes already completed work items and randomizes the order.
+         """
+         # 1) Read the index
+         work_queue_lines = await asyncio.to_thread(download_zstd_csv_local, self._index_path)
+         work_queue = {parts[0]: parts[1:] for line in work_queue_lines if (parts := line.strip().split(",")) and line.strip()}
+
+         # 2) Determine which items are completed by scanning local results/*.jsonl
+         if not os.path.isdir(self._results_dir):
+             os.makedirs(self._results_dir, exist_ok=True)
+         done_work_items = [f for f in os.listdir(self._results_dir) if f.startswith("output_") and f.endswith(".jsonl")]
+         done_work_hashes = {fn[len("output_") : -len(".jsonl")] for fn in done_work_items}
+
+         # 3) Filter out completed items
+         remaining_work_hashes = set(work_queue) - done_work_hashes
+         remaining_items = [WorkItem(hash=hash_, work_paths=work_queue[hash_]) for hash_ in remaining_work_hashes]
+         random.shuffle(remaining_items)
+
+         # 4) Initialize our in-memory queue
+         self._queue = asyncio.Queue()
+         for item in remaining_items:
+             await self._queue.put(item)
+
+         logger.info(f"Initialized local queue with {self._queue.qsize()} work items")
+
+         return self._queue.qsize()
+
+     async def is_completed(self, work_hash: str) -> bool:
+         """
+         Check if a work item has been completed locally by seeing if
+         output_{work_hash}.jsonl is present in the results directory.
+
+         Args:
+             work_hash: Hash of the work item to check
+         """
+         output_file = os.path.join(self._results_dir, f"output_{work_hash}.jsonl")
+         return os.path.exists(output_file)
+
+     async def get_work(self, worker_lock_timeout_secs: int = 1800) -> Optional[WorkItem]:
+         """
+         Get the next available work item that isn't completed or locked.
+
+         Args:
+             worker_lock_timeout_secs: Number of seconds before considering
+                 a worker lock stale (default 30 mins)
+
+         Returns:
+             WorkItem if work is available, None if queue is empty
+         """
+         while True:
+             try:
+                 work_item = self._queue.get_nowait()
+             except asyncio.QueueEmpty:
+                 return None
+
+             # Check if work is already completed
+             if await self.is_completed(work_item.hash):
+                 logger.debug(f"Work item {work_item.hash} already completed, skipping")
+                 self._queue.task_done()
+                 continue
+
+             # Check for worker lock
+             lock_file = os.path.join(self._locks_dir, f"output_{work_item.hash}.jsonl")
+             if os.path.exists(lock_file):
+                 # Check modification time
+                 mtime = datetime.datetime.fromtimestamp(os.path.getmtime(lock_file), datetime.timezone.utc)
+                 if (datetime.datetime.now(datetime.timezone.utc) - mtime).total_seconds() > worker_lock_timeout_secs:
+                     # Lock is stale, we can take this work
+                     logger.debug(f"Found stale lock for {work_item.hash}, taking work item")
+                 else:
+                     # Lock is active, skip this work
+                     logger.debug(f"Work item {work_item.hash} is locked by another worker, skipping")
+                     self._queue.task_done()
+                     continue
+
+             # Create our lock file (touch an empty file)
+             try:
+                 with open(lock_file, "wb") as f:
+                     f.write(b"")
+             except Exception as e:
+                 logger.warning(f"Failed to create lock file for {work_item.hash}: {e}")
+                 self._queue.task_done()
+                 continue
+
+             return work_item
+
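The lock handling in `get_work` treats a lock file as stale once its modification time is older than `worker_lock_timeout_secs`, so a crashed worker's claim eventually expires. A minimal sketch of that check in isolation (a standalone helper, not part of the class above):

```python
import datetime
import os

def lock_is_stale(lock_file: str, timeout_secs: int = 1800) -> bool:
    """Return True if the lock file's mtime is older than timeout_secs."""
    mtime = datetime.datetime.fromtimestamp(os.path.getmtime(lock_file), datetime.timezone.utc)
    age_secs = (datetime.datetime.now(datetime.timezone.utc) - mtime).total_seconds()
    return age_secs > timeout_secs
```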
+     async def mark_done(self, work_item: WorkItem) -> None:
+         """
+         Mark a work item as done by removing its lock file.
+
+         Args:
+             work_item: The WorkItem to mark as done
+         """
+         lock_file = os.path.join(self._locks_dir, f"output_{work_item.hash}.jsonl")
+         if os.path.exists(lock_file):
+             try:
+                 os.remove(lock_file)
+             except Exception as e:
+                 logger.warning(f"Failed to delete lock file for {work_item.hash}: {e}")
+         self._queue.task_done()
+
+     @property
+     def size(self) -> int:
+         """Get current size of local work queue"""
+         return self._queue.qsize()
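Taken together, a typical driver loop over `LocalWorkQueue` might look like the sketch below. This is not part of the module; `process_page_group` and the workspace path are hypothetical, and note that it is the caller who writes `results/output_{hash}.jsonl`, which is what `is_completed` later checks:

```python
import asyncio
import os

async def run_local_workers(pdf_paths, workspace="./local_workspace", items_per_group=100):
    queue = LocalWorkQueue(workspace)
    await queue.populate_queue(pdf_paths, items_per_group)
    remaining = await queue.initialize_queue()
    print(f"{remaining} work items to process")

    while (work_item := await queue.get_work()) is not None:
        # Hypothetical per-path processing; returns one JSON line per path
        json_lines = [await process_page_group(p) for p in work_item.work_paths]

        # Writing results/output_{hash}.jsonl is what marks this group as completed
        out_path = os.path.join(workspace, "results", f"output_{work_item.hash}.jsonl")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n".join(json_lines))

        await queue.mark_done(work_item)

# asyncio.run(run_local_workers(["/data/a.pdf", "/data/b.pdf"]))
```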
pyproject.toml ADDED
@@ -0,0 +1,75 @@
+ [build-system]
+ requires = ["setuptools", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "ocrflux"
+ description = "Fast, efficient, and high quality OCR powered by open visual language models"
+ version = "0.1.0"
+ readme = "README.md"
+ classifiers = [
+     "Intended Audience :: Science/Research",
+     "Development Status :: 3 - Alpha",
+     "License :: OSI Approved :: Apache Software License",
+     "Programming Language :: Python :: 3",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ ]
+ authors = [
+     {name = "Yu Tang", email = "[email protected]"}
+ ]
+ requires-python = ">=3.11"
+ dependencies = [
+     "cached-path",
+     "smart_open",
+     "pypdf>=5.2.0",
+     "pypdfium2",
+     "cryptography",
+     "lingua-language-detector",
+     "Pillow",
+     "ftfy",
+     "bleach",
+     "markdown2",
+     "filelock",
+     "orjson",
+     "requests",
+     "zstandard",
+     "boto3",
+     "httpx",
+     "torch>=2.5.1",
+     "transformers==4.50.0",
+     "vllm==0.7.3",
+     "img2pdf",
+     "nltk",
+     "bs4",
+     "distance",
+     "apted",
+     "gradio",
+     "gradio_pdf",
+ ]
+ license = {file = "LICENSE"}
+
+ [project.urls]
+ Homepage = "https://github.com/chatdoc-com/OCRFlux"
+ Repository = "https://github.com/chatdoc-com/OCRFlux"
+
+ [tool.setuptools.packages.find]
+ exclude = [
+     "*.tests",
+     "*.tests.*",
+     "tests.*",
+     "tests",
+     "docs*",
+     "scripts*",
+     "images*"
+ ]
+
+ [tool.setuptools]
+ include-package-data = true
+
+ [tool.setuptools.package-data]
+ ocrflux = [
+     "py.typed",
+ ]
+
+ [tool.black]
+ line-length = 79
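After installing the project (for example with `pip install -e .`), the declared metadata can be sanity-checked from Python; this is just an optional verification snippet, not part of the package:

```python
from importlib import metadata

print(metadata.version("ocrflux"))               # expected: 0.1.0
print((metadata.requires("ocrflux") or [])[:5])  # first few declared dependencies
```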