"""Token-classification demo for LayoutLMv2 using pre-extracted OCR words/boxes.

Loads the base LayoutLMv2 checkpoint, encodes one image together with its
word list, bounding boxes, and labels, and runs a forward pass that yields
both the training loss and the per-token logits.
"""
from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
from PIL import Image

# The "no_ocr" revision configures the processor to accept caller-supplied
# words/boxes instead of running its built-in Tesseract OCR on the image.
processor = LayoutLMv2Processor.from_pretrained(
    "microsoft/layoutlmv2-base-uncased", revision="no_ocr"
)
model = LayoutLMv2ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv2-base-uncased"
)

# Raw string for the Windows path: the original non-raw literal only worked
# because "\O" and "\I" happen not to be escape sequences; r"..." keeps the
# backslashes intact regardless of the following character.
image = Image.open(
    r"Labels-New\OriginalImages_cropped_ocr_results\IMG_1693.bmp"
).convert("RGB")

words = ["hello", "world"]
# LayoutLMv2 expects each box as [x0, y0, x1, y1] normalized to the 0-1000
# range relative to the image size — make sure to normalize real boxes.
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
word_labels = [0, 1]

# word_labels lets the processor align one label per word with the produced
# subword tokens; return_tensors="pt" yields PyTorch tensors for the model.
encoding = processor(
    image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt"
)
outputs = model(**encoding)
loss = outputs.loss        # cross-entropy loss, available because labels were passed
logits = outputs.logits    # shape (batch, seq_len, num_labels)