zhangyue66 committed
Commit 4fe79f6 · Parent(s): 1787ca5
update
README.md CHANGED
@@ -149,65 +149,98 @@ Currently, we support inference using the PaddleOCR-VL-0.9B model with the `transformers` library.
 > [!NOTE]
 > Note: We currently recommend using the official method for inference, as it is faster and supports page-level document parsing. The example code below only supports element-level recognition.
 
-```bash
-# 1- ensure the flash-attn2 is installed
-!uv pip install -q "transformers>=4.55" bitsandbytes accelerate
-!uv pip install flash-attn --no-build-isolation
-```
-
 ```python
-
+from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
-from PIL import Image
-from google.colab import files
 
+# ---- Settings ----
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+task = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
+# ------------------
 
-
-uploaded = files.upload()
-image_path = list(uploaded.keys())[-1]
-print(f"Using: {image_path}")
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
+PROMPTS = {
+    "ocr": "OCR:",
+    "table": "Table Recognition:",
+    "formula": "Formula Recognition:",
+    "chart": "Chart Recognition:",
+}
 
-
-img = Image.open(image_path).convert("RGB")
-max_size = 2048
-w, h = img.size
-if max(w, h) > max_size:
-    scale = max_size / max(w, h)
-    new_w, new_h = int(w * scale), int(h * scale)
-    img = img.resize((new_w, new_h), Image.LANCZOS)
-    print(f"Resized → {img.size[0]}×{img.size[1]}")
-print(f"current dim → {img.size[0]}×{img.size[1]}")
+image = Image.open(image_path).convert("RGB")
 
+model = AutoModelForCausalLM.from_pretrained(
+    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16
+).to(DEVICE).eval()
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+messages = [
+    {"role": "user",
+     "content": [
+         {"type": "image", "image": image},
+         {"type": "text", "text": PROMPTS[task]},
+     ]
+    }
+]
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    return_tensors="pt"
+).to(DEVICE)
+
+outputs = model.generate(**inputs, max_new_tokens=1024)
+outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+print(outputs)
+```
+
+<details>
+<summary>👉 Click to expand: Use flash-attn to boost performance and reduce memory usage</summary>
+
+<pre><code>
+# ensure the flash-attn2 is installed
+pip install flash-attn --no-build-isolation
+</code></pre>
+
+<pre><code>
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+from PIL import Image
+
+# ---- Settings ----
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+task = "ocr"  # ← change to "table" | "chart" | "formula"
+# ------------------
 
-#4. Load model
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 model = AutoModelForCausalLM.from_pretrained(
-
+    model_path,
     trust_remote_code=True,
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
 ).to(dtype=torch.bfloat16, device=DEVICE).eval()
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 
-processor = AutoProcessor.from_pretrained("PaddlePaddle/PaddleOCR-VL", trust_remote_code=True)
-
-
-
-
-# 5. Choose task
-TASK = "ocr"  # ← change to "table" | "chart" | "formula"
 PROMPTS = {
     "ocr": "OCR:",
     "table": "Table Recognition:",
     "chart": "Chart Recognition:",
     "formula": "Formula Recognition:",
 }
-
-
-
-
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": Image.open(image_path).convert("RGB")},
+            {"type": "text", "text": PROMPTS[task]}
+        ]
+    }
+]
 
 inputs = processor.apply_chat_template(
     messages,
@@ -217,10 +250,6 @@ inputs = processor.apply_chat_template(
     return_tensors="pt"
 ).to(DEVICE)
 
-
-
-
-# 7. Run inference
 with torch.inference_mode():
     out = model.generate(
         **inputs,
@@ -229,11 +258,11 @@ with torch.inference_mode():
         use_cache=True
     )
 
-
-
-
-
-
+outputs = processor.batch_decode(out, skip_special_tokens=True)[0]
+print(outputs)
+</code></pre>
+
+</details>
 
 ## Performance
 
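For context on the note retained in this diff: the "official method" it recommends is the page-level parsing pipeline shipped with the `paddleocr` package rather than the raw `transformers` snippet shown above. Below is a minimal sketch of that path, assuming the `PaddleOCRVL` pipeline class described in the PaddleOCR documentation; it is not part of this commit, and names may differ between package versions.

```python
# Hedged sketch (not part of the commit above): assumes the `paddleocr` package
# exposes the PaddleOCRVL pipeline for page-level document parsing.
from paddleocr import PaddleOCRVL

pipeline = PaddleOCRVL()

# One call runs layout analysis plus per-element recognition on a whole page.
output = pipeline.predict("test.png")

for res in output:
    res.print()                               # structured result to stdout
    res.save_to_json(save_path="output")      # layout + text as JSON
    res.save_to_markdown(save_path="output")  # reconstructed page as Markdown
```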