zhangyue66 committed
Commit 4fe79f6 · 1 Parent(s): 1787ca5
Files changed (1)
  1. README.md +74 -45
README.md CHANGED
@@ -149,65 +149,98 @@ Currently, we support inference using the PaddleOCR-VL-0.9B model with the `transformers` library.
  > [!NOTE]
  > Note: We currently recommend using the official method for inference, as it is faster and supports page-level document parsing. The example code below only supports element-level recognition.

- ```bash
- # 1- ensure the flash-attn2 is installed
- !uv pip install -q "transformers>=4.55" bitsandbytes accelerate
- !uv pip install flash-attn --no-build-isolation
- ```
-
  ```python
- # 1.2 import the necessary libraries
  import torch
  from transformers import AutoModelForCausalLM, AutoProcessor
- from PIL import Image
- from google.colab import files

- # 2- Upload image (drag & drop any PNG/JPG)
- uploaded = files.upload()
- image_path = list(uploaded.keys())[-1]
- print(f"Using: {image_path}")

- # 3. Resize max-2048 preserving aspect ratio
- img = Image.open(image_path).convert("RGB")
- max_size = 2048
- w, h = img.size
- if max(w, h) > max_size:
-     scale = max_size / max(w, h)
-     new_w, new_h = int(w * scale), int(h * scale)
-     img = img.resize((new_w, new_h), Image.LANCZOS)
-     print(f"Resized → {img.size[0]}×{img.size[1]}")
- print(f"current dim → {img.size[0]}×{img.size[1]}")

- #4. Load model
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  model = AutoModelForCausalLM.from_pretrained(
-     "PaddlePaddle/PaddleOCR-VL",
      trust_remote_code=True,
      torch_dtype=torch.bfloat16,
      attn_implementation="flash_attention_2",
  ).to(dtype=torch.bfloat16, device=DEVICE).eval()

- processor = AutoProcessor.from_pretrained("PaddlePaddle/PaddleOCR-VL", trust_remote_code=True)
-
- # 5. Choose task
- TASK = "ocr"  # ← change to "table" | "chart" | "formula"
  PROMPTS = {
      "ocr": "OCR:",
      "table": "Table Recognition:",
      "chart": "Chart Recognition:",
      "formula": "Formula Recognition:",
  }
-
- # 6. Run inference
- messages = [{"role": "user", "content": [{"type": "image", "image": img},
-                                          {"type": "text", "text": PROMPTS[TASK]}]}]

  inputs = processor.apply_chat_template(
      messages,
@@ -217,10 +250,6 @@ inputs = processor.apply_chat_template(
      return_tensors="pt"
  ).to(DEVICE)

-
- # 7. Run inference
  with torch.inference_mode():
      out = model.generate(
          **inputs,
@@ -229,11 +258,11 @@ with torch.inference_mode():
          use_cache=True
      )

- # 8. Decode the output
- result = processor.batch_decode(out, skip_special_tokens=True)[0]
- print("\n" + "="*60 + "\nRESULT:\n" + "="*60)
- print(result)
- ```

  ## Performance
  > [!NOTE]
  > Note: We currently recommend using the official method for inference, as it is faster and supports page-level document parsing. The example code below only supports element-level recognition.
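
The "official method" referred to above is the page-level PaddleOCR pipeline rather than the raw `transformers` snippet that follows. For orientation only, a minimal sketch of that route is shown below; the `paddleocr` package, the `PaddleOCRVL` class, and the result-saving helpers are taken from the upstream PaddleOCR documentation and are assumptions here, not something exercised by this README's example.

```python
# Hedged sketch of the official page-level route (assumes `pip install paddleocr`
# and the PaddleOCRVL pipeline class described in the upstream docs).
from paddleocr import PaddleOCRVL

pipeline = PaddleOCRVL()
output = pipeline.predict("test.png")    # full-page document parsing
for res in output:
    res.print()                          # assumed result helpers from the upstream docs
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```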
  ```python
+ from PIL import Image
  import torch
  from transformers import AutoModelForCausalLM, AutoProcessor

+ # ---- Settings ----
+ model_path = "PaddlePaddle/PaddleOCR-VL"
+ image_path = "test.png"
+ task = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
+ # ------------------

+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

+ PROMPTS = {
+     "ocr": "OCR:",
+     "table": "Table Recognition:",
+     "formula": "Formula Recognition:",
+     "chart": "Chart Recognition:",
+ }

+ image = Image.open(image_path).convert("RGB")

+ model = AutoModelForCausalLM.from_pretrained(
+     model_path, trust_remote_code=True, torch_dtype=torch.bfloat16
+ ).to(DEVICE).eval()
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+ messages = [
+     {"role": "user",
+      "content": [
+          {"type": "image", "image": image},
+          {"type": "text", "text": PROMPTS[task]},
+      ]
+     }
+ ]
+ inputs = processor.apply_chat_template(
+     messages,
+     tokenize=True,
+     add_generation_prompt=True,
+     return_dict=True,
+     return_tensors="pt"
+ ).to(DEVICE)
+
+ outputs = model.generate(**inputs, max_new_tokens=1024)
+ outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+ print(outputs)
+ ```
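
The earlier revision of this example resized large inputs so their longest side was at most 2048 px before inference. If you want to keep that behaviour with the snippet above, a small sketch is shown below; it is plain PIL, and the helper name `load_resized` is ours, introduced only for illustration.

```python
from PIL import Image

def load_resized(path, max_size=2048):
    """Open an image and downscale it so its longest side is at most max_size px."""
    img = Image.open(path).convert("RGB")
    w, h = img.size
    if max(w, h) > max_size:
        scale = max_size / max(w, h)
        img = img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
    return img

# Drop-in replacement for the `image = Image.open(image_path).convert("RGB")` line above.
image = load_resized(image_path)
```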
+
+ <details>
+ <summary>👉 Click to expand: Use flash-attn to boost performance and reduce memory usage</summary>
+
+ <pre><code>
+ # ensure flash-attn 2 is installed
+ pip install flash-attn --no-build-isolation
+ </code></pre>
+
+ <pre><code>
+ import torch
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from PIL import Image
+
+ # ---- Settings ----
+ model_path = "PaddlePaddle/PaddleOCR-VL"
+ image_path = "test.png"
+ task = "ocr"  # ← change to "table" | "chart" | "formula"
+ # ------------------

  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

  model = AutoModelForCausalLM.from_pretrained(
+     model_path,
      trust_remote_code=True,
      torch_dtype=torch.bfloat16,
      attn_implementation="flash_attention_2",
  ).to(dtype=torch.bfloat16, device=DEVICE).eval()
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

  PROMPTS = {
      "ocr": "OCR:",
      "table": "Table Recognition:",
      "chart": "Chart Recognition:",
      "formula": "Formula Recognition:",
  }
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "image": Image.open(image_path).convert("RGB")},
+             {"type": "text", "text": PROMPTS[task]}
+         ]
+     }
+ ]

  inputs = processor.apply_chat_template(
      messages,

      return_tensors="pt"
  ).to(DEVICE)

  with torch.inference_mode():
      out = model.generate(
          **inputs,

          use_cache=True
      )

+ outputs = processor.batch_decode(out, skip_special_tokens=True)[0]
+ print(outputs)
+ </code></pre>
+
+ </details>
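
If flash-attn cannot be installed on your platform, the same code can usually fall back to PyTorch's built-in scaled-dot-product attention by changing only the `attn_implementation` argument; `"sdpa"` is a standard `transformers` option, though whether the model's remote code honours it has not been verified here, so treat this as a hedged sketch.

```python
# Fallback when flash-attn is unavailable: request the built-in SDPA kernels instead.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",  # no extra install required
).to(DEVICE).eval()
```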
  ## Performance