docs: Update README for optimized usage with the transformers library

The Python example for transformers usage has been updated to use flash-attn (FlashAttention-2) as the attention implementation, which boosts performance and reduces memory usage.

Files changed (1) hide show

README.md +64 -23

README.md CHANGED Viewed

@@ -149,49 +149,90 @@ Currently, we support inference using the PaddleOCR-VL-0.9B model with the `tran
 > [!NOTE]
 > Note: We currently recommend using the official method for inference, as it is faster and supports page-level document parsing. The example code below only supports element-level recognition.
 ```python
-from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-CHOSEN_TASK = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
 PROMPTS = {
     "ocr": "OCR:",
     "table": "Table Recognition:",
-    "formula": "Formula Recognition:",
     "chart": "Chart Recognition:",
 }
-model_path = "PaddlePaddle/PaddleOCR-VL"
-image_path = "test.png"
-image = Image.open(image_path).convert("RGB")
-model = AutoModelForCausalLM.from_pretrained(
-    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16
-).to(DEVICE).eval()
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
-messages = [
-    {"role": "user",
-     "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": PROMPTS[CHOSEN_TASK]},
-        ]
-    }
-]
 inputs = processor.apply_chat_template(
     messages,
     tokenize=True,
     add_generation_prompt=True,
     return_dict=True,
-	return_tensors="pt"
 ).to(DEVICE)
-outputs = model.generate(**inputs, max_new_tokens=1024)
-outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-print(outputs)
 ```
 ## Performance

 > [!NOTE]
 > Note: We currently recommend using the official method for inference, as it is faster and supports page-level document parsing. The example code below only supports element-level recognition.
+```bash
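+# 1. Install the dependencies, including flash-attn (FlashAttention-2); the leading "!" is notebook-cell syntax, so drop it in a plain shell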
+!uv pip install -q "transformers>=4.55" bitsandbytes accelerate
+!uv pip install flash-attn --no-build-isolation
+```
 ```python
+# 1.2 import the necessary libraries
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
+from PIL import Image
+from google.colab import files
+# 2- Upload image (drag & drop any PNG/JPG)
+uploaded = files.upload()
+image_path = list(uploaded.keys())[-1]
+print(f"Using: {image_path}")
+# 3. Resize max-2048 preserving aspect ratio
+img = Image.open(image_path).convert("RGB")
+max_size = 2048
+w, h = img.size
+if max(w, h) > max_size:
+    scale = max_size / max(w, h)
+    new_w, new_h = int(w * scale), int(h * scale)
+    img = img.resize((new_w, new_h), Image.LANCZOS)
+    print(f"Resized → {img.size[0]}×{img.size[1]}")
+print(f"current dim → {img.size[0]}×{img.size[1]}")
+# 4. Load model
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+model = AutoModelForCausalLM.from_pretrained(
+    "PaddlePaddle/PaddleOCR-VL",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+).to(dtype=torch.bfloat16, device=DEVICE).eval()
+processor = AutoProcessor.from_pretrained("PaddlePaddle/PaddleOCR-VL", trust_remote_code=True)
+# 5. Choose task
+TASK = "ocr"        # ← change to "table" | "chart" | "formula"
 PROMPTS = {
     "ocr": "OCR:",
     "table": "Table Recognition:",
     "chart": "Chart Recognition:",
+    "formula": "Formula Recognition:",
 }
+# 6. Build the chat-formatted model inputs
+messages = [{"role": "user", "content": [{"type": "image", "image": img},
+                                         {"type": "text",  "text": PROMPTS[TASK]}]}]
 inputs = processor.apply_chat_template(
     messages,
     tokenize=True,
     add_generation_prompt=True,
     return_dict=True,
+    return_tensors="pt"
 ).to(DEVICE)
+# 7. Run inference
+with torch.inference_mode():
+    out = model.generate(
+        **inputs,
+        max_new_tokens=1024,
+        do_sample=False,
+        use_cache=True
+    )
+# 8. Decode the output
+result = processor.batch_decode(out, skip_special_tokens=True)[0]
+print("\n" + "="*60 + "\nRESULT:\n" + "="*60)
+print(result)
 ```
 ## Performance