Commit c184a3c
Parent: 17d6209

Update files/functions.py

files/functions.py CHANGED (+12 -12)
@@ -416,7 +416,7 @@ def extraction_data_from_image(images):
         print(f"There was an error within the extraction of PDF text by the OCR!")
     else:
         from datasets import Dataset
-        dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "
+        dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_par": par_boxes_list})
 
     print(f"The text data was successfully extracted by the OCR!")
 
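The added line 419 stores paragraph-level boxes under a new "bboxes_par" column. A minimal sketch of the resulting schema, with toy values standing in for the lists that extraction_data_from_image builds from the OCR output:

from datasets import Dataset

# Toy stand-ins for the OCR lists; real entries hold PIL page images,
# page metadata, extracted text and paragraph bounding boxes.
dataset = Dataset.from_dict({
    "images_ids": ["doc0"],
    "images": [None],                      # a PIL.Image in the real pipeline
    "page_no": [0],
    "num_pages": [1],
    "texts": [["First paragraph text."]],  # one string per paragraph
    "bboxes_par": [[[10, 10, 500, 60]]],   # one (x1, y1, x2, y2) box per paragraph
})
print(dataset.column_names)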
@@ -433,7 +433,7 @@ def prepare_inference_features(example):
     # batch_page_hash = example["page_hash"]
     batch_images_ids = example["images_ids"]
     batch_images = example["images"]
-
+    batch_bboxes_par = example["bboxes_par"]
     batch_texts = example["texts"]
     batch_images_size = [image.size for image in batch_images]
 
@@ -443,12 +443,12 @@ def prepare_inference_features(example):
     if not isinstance(batch_images_ids, list):
         batch_images_ids = [batch_images_ids]
         batch_images = [batch_images]
-
+        batch_bboxes_par = [batch_bboxes_par]
         batch_texts = [batch_texts]
         batch_width, batch_height = [batch_width], [batch_height]
 
     # process all images of the batch
-    for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids,
+    for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_bboxes_par, batch_texts, batch_width, batch_height)):
         tokens_list = []
         bboxes_list = []
 
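The isinstance guard in this hunk wraps every field in a one-element list when prepare_inference_features receives a single example rather than a batch, so the enumerate/zip loop that follows treats both cases identically. A standalone sketch of the same pattern (field names and values are illustrative):

def as_batch(example):
    # Wrap every field in a one-element list when given a single example,
    # so downstream batch loops work unchanged.
    if not isinstance(example["images_ids"], list):
        return {key: [value] for key, value in example.items()}
    return example

print(as_batch({"images_ids": "doc0", "texts": "hello"}))
# -> {'images_ids': ['doc0'], 'texts': ['hello']}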
@@ -457,11 +457,11 @@ def prepare_inference_features(example):
             texts, boxes = [texts], [boxes]
 
         # convert boxes to original
-
+        normalize_bboxes_par = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
 
         # sort boxes with texts
         # we want sorted lists from top to bottom of the image
-        boxes, texts = sort_data_wo_labels(
+        boxes, texts = sort_data_wo_labels(normalize_bboxes_par, texts)
 
         count = 0
         for box, text in zip(boxes, texts):
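normalize_box, upperleft_to_lowerright and sort_data_wo_labels are defined elsewhere in files/functions.py and are untouched by this commit. Below is a hedged sketch of implementations consistent with how the new line calls them; the LayoutLM-style 0-1000 coordinate grid is an assumption, not something this diff confirms:

def upperleft_to_lowerright(box):
    # Assumed (x, y, w, h) -> (x1, y1, x2, y2) conversion.
    x, y, w, h = box
    return [x, y, x + w, y + h]

def normalize_box(box, width, height):
    # Assumed LayoutLM-style scaling of pixel coordinates to a 0-1000 grid.
    return [
        int(1000 * box[0] / width),
        int(1000 * box[1] / height),
        int(1000 * box[2] / width),
        int(1000 * box[3] / height),
    ]

def sort_data_wo_labels(boxes, texts):
    # Assumed ordering: top to bottom, then left to right, keeping texts aligned.
    order = sorted(range(len(boxes)), key=lambda i: (boxes[i][1], boxes[i][0]))
    return [boxes[i] for i in order], [texts[i] for i in order]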
@@ -593,8 +593,8 @@ def predictions_token_level(images, custom_encoded_dataset):
 
 from functools import reduce
 
-# Get predictions (
-def
+# Get predictions (par level)
+def predictions_par_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
 
     ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
     bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
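The body of predictions_par_level falls outside this diff; from its name and the dictionaries it initializes, it presumably collapses token-level label probabilities that share a paragraph box into one prediction per paragraph. A purely hypothetical illustration of that kind of aggregation (keys and probabilities are made up):

# Token-level class probabilities grouped under a stringified paragraph box.
token_probs = {"[10, 10, 500, 60]": [[0.9, 0.1], [0.8, 0.2]]}

# Average the per-token distributions to get one distribution per paragraph.
par_probs = {
    box: [sum(col) / len(probs) for col in zip(*probs)]
    for box, probs in token_probs.items()
}
print(par_probs)  # {'[10, 10, 500, 60]': [0.85, 0.15...]}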
@@ -688,7 +688,7 @@ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_i
     else:
         print("An error occurred while getting predictions!")
 
-# Get labeled images with
+# Get labeled images with paragraphs bounding boxes
 def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
 
     labeled_images = list()
@@ -763,7 +763,7 @@ def get_encoded_chunk_inference(index_chunk=None):
         del input_ids_dict[str(bboxes_list[-1])]
         bboxes_list = bboxes_list[:-1]
 
-    # get texts by
+    # get texts by paragraph
     input_ids_list = input_ids_dict.values()
     texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
 
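In this hunk, input_ids_dict maps a stringified paragraph box to the token ids that fall inside that box, and each group is decoded back into its paragraph text. A toy version of that decode step (the tokenizer checkpoint here is illustrative; the app loads its own model's tokenizer):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Token ids keyed by stringified paragraph box, as in the hunk above.
input_ids_dict = {
    "[10, 10, 500, 60]": tokenizer("First paragraph.")["input_ids"],
    "[10, 80, 500, 130]": tokenizer("Second paragraph.")["input_ids"],
}
texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_dict.values()]
print(texts_list)  # one decoded string per paragraph, special tokens included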
@@ -773,7 +773,7 @@ def get_encoded_chunk_inference(index_chunk=None):
     return image, df, num_tokens, page_no, num_pages
 
 # display chunk of PDF image and its data
-def
+def display_chunk_paragraphs_inference(index_chunk=None):
 
     # get image and image data
     image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
@@ -786,7 +786,7 @@ def display_chunk_lines_inference(index_chunk=None):
     print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
 
     # display image with bounding boxes
-    print(">> PDF image with bounding boxes of
+    print(">> PDF image with bounding boxes of paragraphs\n")
    draw = ImageDraw.Draw(image)
 
     labels = list()
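The display path above draws the paragraph boxes onto the page image with PIL's ImageDraw. A self-contained sketch of that drawing step, with a blank image and toy boxes standing in for the real page and its detected paragraphs:

from PIL import Image, ImageDraw

image = Image.new("RGB", (600, 800), "white")     # stands in for the PDF page image
bboxes = [[10, 10, 500, 60], [10, 80, 500, 130]]  # toy paragraph boxes (x1, y1, x2, y2)

draw = ImageDraw.Draw(image)
for box in bboxes:
    draw.rectangle(box, outline="red", width=2)
image.save("page_with_paragraph_boxes.png")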