[email protected] committed
Commit 629edc1
Parent: ff64714
Files changed (4)
  1. README.md +1 -6
  2. md.py +0 -1
  3. ocr.py +8 -9
  4. output.png +2 -2
README.md CHANGED
@@ -15,12 +15,7 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
 ## NOTE
 Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.
 
-## How to Use?
-This repo will be soon merged to official Transformers.
-```bash
-pip install git+https://github.com/tic-top/transformers.git
-```
-
+## Usage
 ### Markdown Task
 Run with [md.py](md.py).
 ```text
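For context, the scripts touched in this same commit load the model as in the sketch below. This is only a reference snippet assembled from md.py / ocr.py, and it assumes an installed `transformers` build that already ships the Kosmos-2.5 classes (which is what dropping the `pip install git+...` hint above points at).

```python
# Reference load sketch mirroring md.py / ocr.py in this commit.
# Assumes transformers already provides Kosmos2_5ForConditionalGeneration.
import torch
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration

repo = "microsoft/kosmos-2.5"
device = "cuda:0"  # ocr.py switches this to "cuda:1" in this commit; any available GPU works
dtype = torch.bfloat16

model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
processor = AutoProcessor.from_pretrained(repo)
```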
md.py CHANGED
@@ -12,7 +12,6 @@ processor = AutoProcessor.from_pretrained(repo)
 
 # sample image
 url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
-url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
 image = Image.open(requests.get(url, stream=True).raw)
 
 prompt = "<md>"
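The hunk above only touches the sample-image setup (note that a raw file download from the Hub normally needs a `/resolve/` URL rather than `/blob/`). The rest of the markdown flow is not shown in this diff; below is a minimal sketch, assuming the `model`, `processor`, `device`, and `dtype` objects loaded as in the snippet above and following the preprocessing pattern visible in ocr.py further down.

```python
import requests
from PIL import Image

# Sample image and <md> prompt, as in the hunk above.
url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
image = Image.open(requests.get(url, stream=True).raw)
prompt = "<md>"

# Preprocess; height/width are only used for box rescaling in the OCR task.
inputs = processor(text=prompt, images=image, return_tensors="pt")
inputs.pop("height"), inputs.pop("width")
inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)

# Generate and decode the markdown transcription.
generated_ids = model.generate(**inputs, max_new_tokens=1024)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```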
ocr.py CHANGED
@@ -5,17 +5,16 @@ from PIL import Image, ImageDraw
 from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
 
 repo = "microsoft/kosmos-2.5"
-device = "cuda:0"
+device = "cuda:1"
 dtype = torch.bfloat16
 model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
 processor = AutoProcessor.from_pretrained(repo)
 
 # sample image
 url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
-url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
 image = Image.open(requests.get(url, stream=True).raw)
 
-# singe image
+# bs = 1
 prompt = "<ocr>"
 inputs = processor(text=prompt, images=image, return_tensors="pt")
 height, width = inputs.pop("height"), inputs.pop("width")
@@ -23,12 +22,12 @@ raw_width, raw_height = image.size
 scale_height = raw_height / height
 scale_width = raw_width / width
 
-# batch generate
-# inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
-# height, width = inputs.pop("height"), inputs.pop("width")
-# raw_width, raw_height = image.size
-# scale_height = raw_height / height[0]
-# scale_width = raw_width / width[0]
+# bs > 1, batch decoding sample
+inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
+height, width = inputs.pop("height"), inputs.pop("width")
+raw_width, raw_height = image.size
+scale_height = raw_height / height[0]
+scale_width = raw_width / width[0]
 
 inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
 inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
output.png CHANGED

Git LFS Details (before)
  • SHA256: 1cfde9c32f97383e601620280b60787bae1472010e8b5fe4d92a445fde0df6cd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.66 MB

Git LFS Details (after)
  • SHA256: 410b17e2b48d588c7bd9317e924e69841c0b9670848fe0efa217389d74882d32
  • Pointer size: 132 Bytes
  • Size of remote file: 1.66 MB