ibrahim313 committed on
Commit
120db54
·
verified ·
1 Parent(s): edf2b63

Create app.py

Files changed (1)
app.py +153 -0
app.py ADDED
@@ -0,0 +1,153 @@
"""
olmOCR – Gradio Space
Upload any PDF ➜ get clean, linearised text.

🚀 Model: allenai/olmOCR-7B-0225-preview
🔧 Prompts / render helpers come from the `olmocr` toolkit
"""

import base64
import gc
import json
import os
import shutil
import tempfile
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from pypdf import PdfReader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png  # page → base64 PNG
from olmocr.prompts.anchor import get_anchor_text          # page → anchor text
from olmocr.prompts import build_finetuning_prompt         # anchor → final prompt

# ---------- 1. Model & processor (load once, then stay in memory) ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).to(device).eval()

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# ---------- 2. Utility ------------------------------------------------------
def _decode_llm_json(raw_str: str) -> str:
    """
    olmOCR returns a JSON string like:
        {
          "primary_language": "...",
          ...
          "natural_text": "THE ACTUAL PAGE TEXT"
        }
    Pull out the `natural_text` field; fall back to the raw string if parsing fails.
    """
    try:
        page_json = json.loads(raw_str.strip())
        return page_json.get("natural_text") or ""
    except Exception:
        return raw_str.strip()
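
# Illustrative behaviour (toy strings, not real model output):
#   _decode_llm_json('{"primary_language": "en", "natural_text": "Hi"}') -> "Hi"
#   _decode_llm_json('not valid json')                                   -> "not valid json"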

# ---------- 3. Core pipeline ------------------------------------------------
def pdf_to_text(pdf_file):
    """
    • Copy the uploaded file to a temp path (the toolkit needs a real path)
    • Iterate over pages; for each page:
        – render the page image → base64
        – extract the anchor text from the page
        – build the prompt (+ image) and run the model
        – collect `natural_text`
    • Return the merged text
    """
    if pdf_file is None:
        return "⬆️ Please upload a PDF first."

    with tempfile.TemporaryDirectory() as tmpdir:
        local_pdf_path = os.path.join(tmpdir, "input.pdf")
        # gr.File yields a filepath string (Gradio 4) or a closed tempfile
        # wrapper with a .name attribute (Gradio 3) – never a readable stream,
        # so copy by path instead of calling .read().
        src_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        shutil.copyfile(src_path, local_pdf_path)

        reader = PdfReader(local_pdf_path)
        n_pages = len(reader.pages)

        extracted_pages = []

        for page_idx in range(1, n_pages + 1):  # the toolkit is 1-indexed
            # a. Render the page to an image
            img_b64 = render_pdf_to_base64png(
                local_pdf_path, page_idx, target_longest_image_dim=1024
            )
            page_image = Image.open(BytesIO(base64.b64decode(img_b64)))

            # b. Anchor text & prompt
            anchor = get_anchor_text(
                local_pdf_path,
                page_idx,
                pdf_engine="pdfreport",  # uses pypdf / pdfium, no Poppler dependency
                target_length=4000,
            )
            prompt = build_finetuning_prompt(anchor)
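            # The anchor text is olmOCR's "document anchoring": raw text and
            # layout hints pulled from the PDF itself, folded into the prompt so
            # the model can cross-check them against the rendered page image.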

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
                    ],
                }
            ]
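            # The OpenAI-style "image_url" entry mirrors the usage example on the
            # olmOCR model card; the chat template turns it into an image
            # placeholder, and the pixels are supplied via `images=` below.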

            # c. Tokenise + generate
            text_in = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(text=[text_in], images=[page_image], return_tensors="pt", padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                gen = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,  # greedy decoding – a temperature would be ignored here
                )

            # Decode only the newly generated tokens, not the echoed prompt
            prompt_len = inputs["input_ids"].shape[1]
            new_tokens = gen[:, prompt_len:]
            raw_out = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

            extracted_pages.append(_decode_llm_json(raw_out))

            # optional memory clean-up per page
            del inputs, gen
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    return "\n\n".join(extracted_pages) or "🤔 Nothing returned."

# ---------- 4. Gradio UI ----------------------------------------------------
with gr.Blocks(title="olmOCR 7B PDF Extractor") as demo:
    gr.Markdown(
        """
        # 🧠 **olmOCR**
        Upload a PDF → get high-quality, linearised text (tables → Markdown, equations → LaTeX).
        Fine-tuned Vision-LLM: **allenai/olmOCR-7B-0225-preview**.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            up = gr.File(label="📄 Upload PDF", file_types=[".pdf"])
            go = gr.Button("Extract Text", variant="primary", size="lg")
        with gr.Column(scale=2):
            out = gr.Textbox(
                label="📜 Extracted text",
                lines=25,
                interactive=False,
                show_copy_button=True,
            )

    go.click(pdf_to_text, inputs=up, outputs=out)
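
    # One long PDF can keep a request busy for minutes; chaining demo.queue()
    # in front of launch() is the usual Gradio way to avoid request timeouts.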

# ---------- 5. Launch locally (Space will ignore this) ----------------------
if __name__ == "__main__":
    demo.launch()
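
A note on dependencies: the imports above imply a requirements.txt roughly like the sketch below. Package names are inferred from the imports and version pins are deliberately omitted; treat it as a starting point, not the committed file.

gradio
torch
transformers
pillow
pypdf
olmocr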