ibrahim313 committed on
Commit
120db54
·
verified ·
1 Parent(s): edf2b63

Create app.py

Files changed (1)
app.py +153 -0
app.py ADDED
@@ -0,0 +1,153 @@
"""
olmOCR – Gradio Space
Upload any PDF ➜ get clean, linearised text.

🚀 Model: allenai/olmOCR-7B-0225-preview
🔧 Prompts / render helpers come from the `olmocr` toolkit
"""

import base64
import gc
import json
import os
import shutil
import tempfile
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from pypdf import PdfReader
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from olmocr.data.renderpdf import render_pdf_to_base64png  # page → base64 PNG
from olmocr.prompts.anchor import get_anchor_text          # page → anchor text
from olmocr.prompts import build_finetuning_prompt         # anchor → final prompt

# ---------- 1. Model & processor (load once, then stay in memory) ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "allenai/olmOCR-7B-0225-preview",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
).to(device).eval()

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# ---------- 2. Utility ------------------------------------------------------
def _decode_llm_json(raw_str: str) -> str:
    """
    olmOCR returns a JSON string like:
        {
          "primary_language": "...",
          ...
          "natural_text": "THE ACTUAL PAGE TEXT"
        }
    Pull out the `natural_text` field; fall back to the raw string if parsing fails.
    """
    try:
        page_json = json.loads(raw_str.strip())
        return page_json.get("natural_text") or ""
    except Exception:
        return raw_str.strip()
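
# Illustrative behaviour (toy strings, not real model output):
#   _decode_llm_json('{"primary_language": "en", "natural_text": "Hi"}') -> "Hi"
#   _decode_llm_json('not valid json')                                   -> "not valid json"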

# ---------- 3. Core pipeline ------------------------------------------------
def pdf_to_text(pdf_file):
    """
    • Copy the uploaded file to a temp path (the toolkit needs a real path)
    • Iterate over pages; for each page:
        – render the page image → base64
        – extract the anchor text from the page
        – build the prompt (+ image) and run the model
        – collect `natural_text`
    • Return the merged text
    """
    if pdf_file is None:
        return "⬆️ Please upload a PDF first."

    with tempfile.TemporaryDirectory() as tmpdir:
        local_pdf_path = os.path.join(tmpdir, "input.pdf")
        # gr.File yields a filepath string (Gradio 4) or a closed tempfile
        # wrapper with a .name attribute (Gradio 3) – never a readable stream,
        # so copy by path instead of calling .read().
        src_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        shutil.copyfile(src_path, local_pdf_path)

        reader = PdfReader(local_pdf_path)
        n_pages = len(reader.pages)

        extracted_pages = []

        for page_idx in range(1, n_pages + 1):  # the toolkit is 1-indexed
            # a. Render the page to an image
            img_b64 = render_pdf_to_base64png(
                local_pdf_path, page_idx, target_longest_image_dim=1024
            )
            page_image = Image.open(BytesIO(base64.b64decode(img_b64)))

            # b. Anchor text & prompt
            anchor = get_anchor_text(
                local_pdf_path,
                page_idx,
                pdf_engine="pdfreport",  # uses pypdf / pdfium, no Poppler dependency
                target_length=4000,
            )
            prompt = build_finetuning_prompt(anchor)
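            # The anchor text is olmOCR's "document anchoring": raw text and
            # layout hints pulled from the PDF itself, folded into the prompt so
            # the model can cross-check them against the rendered page image.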

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64}"}},
                    ],
                }
            ]
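            # The OpenAI-style "image_url" entry mirrors the usage example on the
            # olmOCR model card; the chat template turns it into an image
            # placeholder, and the pixels are supplied via `images=` below.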

            # c. Tokenise + generate
            text_in = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor(text=[text_in], images=[page_image], return_tensors="pt", padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                gen = model.generate(
                    **inputs,
                    max_new_tokens=512,
                    do_sample=False,  # greedy decoding – a temperature would be ignored here
                )

            # Decode only the newly generated tokens, not the echoed prompt
            prompt_len = inputs["input_ids"].shape[1]
            new_tokens = gen[:, prompt_len:]
            raw_out = processor.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]

            extracted_pages.append(_decode_llm_json(raw_out))

            # optional memory clean-up per page
            del inputs, gen
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    return "\n\n".join(extracted_pages) or "🤔 Nothing returned."

# ---------- 4. Gradio UI ----------------------------------------------------
with gr.Blocks(title="olmOCR 7B PDF Extractor") as demo:
    gr.Markdown(
        """
        # 🧠 **olmOCR**
        Upload a PDF → get high-quality, linearised text (tables → Markdown, equations → LaTeX).
        Fine-tuned Vision-LLM: **allenai/olmOCR-7B-0225-preview**.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            up = gr.File(label="📄 Upload PDF", file_types=[".pdf"])
            go = gr.Button("Extract Text", variant="primary", size="lg")
        with gr.Column(scale=2):
            out = gr.Textbox(
                label="📜 Extracted text",
                lines=25,
                interactive=False,
                show_copy_button=True,
            )

    go.click(pdf_to_text, inputs=up, outputs=out)
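
    # One long PDF can keep a request busy for minutes; chaining demo.queue()
    # in front of launch() is the usual Gradio way to avoid request timeouts.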

# ---------- 5. Launch locally (Space will ignore this) ----------------------
if __name__ == "__main__":
    demo.launch()
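
A note on dependencies: the imports above imply a requirements.txt roughly like the sketch below. Package names are inferred from the imports and version pins are deliberately omitted; treat it as a starting point, not the committed file.

gradio
torch
transformers
pillow
pypdf
olmocr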