zhangyue66 committed
Commit 4fe79f6 · Parent(s): 1787ca5
update
README.md CHANGED
@@ -149,65 +149,98 @@ Currently, we support inference using the PaddleOCR-VL-0.9B model with the `transformers` library.
 > [!NOTE]
 > Note: We currently recommend using the official method for inference, as it is faster and supports page-level document parsing. The example code below only supports element-level recognition.
 
-```bash
-# 1- ensure the flash-attn2 is installed
-!uv pip install -q "transformers>=4.55" bitsandbytes accelerate
-!uv pip install flash-attn --no-build-isolation
-```
-
 ```python
-
+from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
-from PIL import Image
-from google.colab import files
 
+# ---- Settings ----
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+task = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
+# ------------------
 
-
-uploaded = files.upload()
-image_path = list(uploaded.keys())[-1]
-print(f"Using: {image_path}")
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
+PROMPTS = {
+    "ocr": "OCR:",
+    "table": "Table Recognition:",
+    "formula": "Formula Recognition:",
+    "chart": "Chart Recognition:",
+}
 
-
-img = Image.open(image_path).convert("RGB")
-max_size = 2048
-w, h = img.size
-if max(w, h) > max_size:
-    scale = max_size / max(w, h)
-    new_w, new_h = int(w * scale), int(h * scale)
-    img = img.resize((new_w, new_h), Image.LANCZOS)
-    print(f"Resized → {img.size[0]}×{img.size[1]}")
-print(f"current dim → {img.size[0]}×{img.size[1]}")
+image = Image.open(image_path).convert("RGB")
 
+model = AutoModelForCausalLM.from_pretrained(
+    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16
+).to(DEVICE).eval()
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+messages = [
+    {"role": "user",
+     "content": [
+         {"type": "image", "image": image},
+         {"type": "text", "text": PROMPTS[task]},
+     ]
+    }
+]
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    return_tensors="pt"
+).to(DEVICE)
+
+outputs = model.generate(**inputs, max_new_tokens=1024)
+outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+print(outputs)
+```
+
+<details>
+<summary>👉 Click to expand: Use flash-attn to boost performance and reduce memory usage</summary>
+
+<pre><code>
+# ensure the flash-attn2 is installed
+pip install flash-attn --no-build-isolation
+</code></pre>
+
+<pre><code>
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+from PIL import Image
+
+# ---- Settings ----
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+task = "ocr"  # ← change to "table" | "chart" | "formula"
+# ------------------
 
-#4. Load model
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 model = AutoModelForCausalLM.from_pretrained(
-
+    model_path,
     trust_remote_code=True,
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
 ).to(dtype=torch.bfloat16, device=DEVICE).eval()
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 
-processor = AutoProcessor.from_pretrained("PaddlePaddle/PaddleOCR-VL", trust_remote_code=True)
-
-
-
-
-# 5. Choose task
-TASK = "ocr"  # ← change to "table" | "chart" | "formula"
 PROMPTS = {
     "ocr": "OCR:",
     "table": "Table Recognition:",
     "chart": "Chart Recognition:",
     "formula": "Formula Recognition:",
 }
-
-
-
-
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": Image.open(image_path).convert("RGB")},
+            {"type": "text", "text": PROMPTS[task]}
+        ]
+    }
+]
 
 inputs = processor.apply_chat_template(
     messages,
@@ -217,10 +250,6 @@ inputs = processor.apply_chat_template(
     return_tensors="pt"
 ).to(DEVICE)
 
-
-
-
-# 7. Run inference
 with torch.inference_mode():
     out = model.generate(
         **inputs,
@@ -229,11 +258,11 @@ with torch.inference_mode():
         use_cache=True
     )
 
-
-
-
-
-
+outputs = processor.batch_decode(out, skip_special_tokens=True)[0]
+print(outputs)
+</code></pre>
+
+</details>
 
 ## Performance
 
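For context on the note retained in this diff: the "official method" it recommends is the page-level parsing pipeline shipped with the `paddleocr` package rather than the raw `transformers` snippet shown above. Below is a minimal sketch of that path, assuming the `PaddleOCRVL` pipeline class described in the PaddleOCR documentation; it is not part of this commit, and names may differ between package versions.

```python
# Hedged sketch (not part of the commit above): assumes the `paddleocr` package
# exposes the PaddleOCRVL pipeline for page-level document parsing.
from paddleocr import PaddleOCRVL

pipeline = PaddleOCRVL()

# One call runs layout analysis plus per-element recognition on a whole page.
output = pipeline.predict("test.png")

for res in output:
    res.print()                               # structured result to stdout
    res.save_to_json(save_path="output")      # layout + text as JSON
    res.save_to_markdown(save_path="output")  # reconstructed page as Markdown
```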