pierreguillou committed on
Commit
c184a3c
·
1 Parent(s): 17d6209

Update files/functions.py

Browse files
Files changed (1) hide show
  1. files/functions.py +12 -12
files/functions.py CHANGED
@@ -416,7 +416,7 @@ def extraction_data_from_image(images):
416
  print(f"There was an error within the extraction of PDF text by the OCR!")
417
  else:
418
  from datasets import Dataset
419
- dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_line": line_boxes_list})
420
 
421
  print(f"The text data was successfully extracted by the OCR!")
422
 
@@ -433,7 +433,7 @@ def prepare_inference_features(example):
433
  # batch_page_hash = example["page_hash"]
434
  batch_images_ids = example["images_ids"]
435
  batch_images = example["images"]
436
- batch_bboxes_line = example["bboxes_line"]
437
  batch_texts = example["texts"]
438
  batch_images_size = [image.size for image in batch_images]
439
 
@@ -443,12 +443,12 @@ def prepare_inference_features(example):
443
  if not isinstance(batch_images_ids, list):
444
  batch_images_ids = [batch_images_ids]
445
  batch_images = [batch_images]
446
- batch_bboxes_line = [batch_bboxes_line]
447
  batch_texts = [batch_texts]
448
  batch_width, batch_height = [batch_width], [batch_height]
449
 
450
  # process all images of the batch
451
- for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_bboxes_line, batch_texts, batch_width, batch_height)):
452
  tokens_list = []
453
  bboxes_list = []
454
 
@@ -457,11 +457,11 @@ def prepare_inference_features(example):
457
  texts, boxes = [texts], [boxes]
458
 
459
  # convert boxes to original
460
- normalize_bboxes_line = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
461
 
462
  # sort boxes with texts
463
  # we want sorted lists from top to bottom of the image
464
- boxes, texts = sort_data_wo_labels(normalize_bboxes_line, texts)
465
 
466
  count = 0
467
  for box, text in zip(boxes, texts):
@@ -593,8 +593,8 @@ def predictions_token_level(images, custom_encoded_dataset):
593
 
594
  from functools import reduce
595
 
596
- # Get predictions (line level)
597
- def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
598
 
599
  ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
600
  bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
@@ -688,7 +688,7 @@ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_i
688
  else:
689
  print("An error occurred while getting predictions!")
690
 
691
- # Get labeled images with lines bounding boxes
692
  def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
693
 
694
  labeled_images = list()
@@ -763,7 +763,7 @@ def get_encoded_chunk_inference(index_chunk=None):
763
  del input_ids_dict[str(bboxes_list[-1])]
764
  bboxes_list = bboxes_list[:-1]
765
 
766
- # get texts by line
767
  input_ids_list = input_ids_dict.values()
768
  texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
769
 
@@ -773,7 +773,7 @@ def get_encoded_chunk_inference(index_chunk=None):
773
  return image, df, num_tokens, page_no, num_pages
774
 
775
  # display chunk of PDF image and its data
776
- def display_chunk_lines_inference(index_chunk=None):
777
 
778
  # get image and image data
779
  image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
@@ -786,7 +786,7 @@ def display_chunk_lines_inference(index_chunk=None):
786
  print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
787
 
788
  # display image with bounding boxes
789
- print(">> PDF image with bounding boxes of lines\n")
790
  draw = ImageDraw.Draw(image)
791
 
792
  labels = list()
 
416
  print(f"There was an error within the extraction of PDF text by the OCR!")
417
  else:
418
  from datasets import Dataset
419
+ dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_par": par_boxes_list})
420
 
421
  print(f"The text data was successfully extracted by the OCR!")
422
 
 
433
  # batch_page_hash = example["page_hash"]
434
  batch_images_ids = example["images_ids"]
435
  batch_images = example["images"]
436
+ batch_bboxes_par = example["bboxes_par"]
437
  batch_texts = example["texts"]
438
  batch_images_size = [image.size for image in batch_images]
439
 
 
443
  if not isinstance(batch_images_ids, list):
444
  batch_images_ids = [batch_images_ids]
445
  batch_images = [batch_images]
446
+ batch_bboxes_par = [batch_bboxes_par]
447
  batch_texts = [batch_texts]
448
  batch_width, batch_height = [batch_width], [batch_height]
449
 
450
  # process all images of the batch
451
+ for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_bboxes_par, batch_texts, batch_width, batch_height)):
452
  tokens_list = []
453
  bboxes_list = []
454
 
 
457
  texts, boxes = [texts], [boxes]
458
 
459
  # convert boxes to original
460
+ normalize_bboxes_par = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
461
 
462
  # sort boxes with texts
463
  # we want sorted lists from top to bottom of the image
464
+ boxes, texts = sort_data_wo_labels(normalize_bboxes_par, texts)
465
 
466
  count = 0
467
  for box, text in zip(boxes, texts):
 
593
 
594
  from functools import reduce
595
 
596
+ # Get predictions (par level)
597
+ def predictions_par_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
598
 
599
  ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
600
  bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
 
688
  else:
689
  print("An error occurred while getting predictions!")
690
 
691
+ # Get labeled images with paragraphs bounding boxes
692
  def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
693
 
694
  labeled_images = list()
 
763
  del input_ids_dict[str(bboxes_list[-1])]
764
  bboxes_list = bboxes_list[:-1]
765
 
766
+ # get texts by paragraph
767
  input_ids_list = input_ids_dict.values()
768
  texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
769
 
 
773
  return image, df, num_tokens, page_no, num_pages
774
 
775
  # display chunk of PDF image and its data
776
+ def display_chunk_paragraphs_inference(index_chunk=None):
777
 
778
  # get image and image data
779
  image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
 
786
  print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
787
 
788
  # display image with bounding boxes
789
+ print(">> PDF image with bounding boxes of paragraphs\n")
790
  draw = ImageDraw.Draw(image)
791
 
792
  labels = list()