pierreguillou committed on
Commit
e225cae
·
1 Parent(s): 3ab39f3

Update files/functions.py

Browse files
Files changed (1) hide show
  1. files/functions.py +15 -2
files/functions.py CHANGED
@@ -346,7 +346,7 @@ def pdf_to_images(uploaded_pdf):
346
  except PdfReadError:
347
  path_to_file = pdf_blank
348
  filename = path_to_file.replace(examples_dir,"")
349
- msg = "invalid PDF file."
350
  images = [Image.open(image_blank)]
351
  else:
352
  try:
@@ -463,8 +463,21 @@ def prepare_inference_features(example):
463
  # we want sorted lists from top to bottom of the image
464
  boxes, texts = sort_data_wo_labels(normalize_bboxes_par, texts)
465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
  count = 0
467
- for box, text in zip(boxes, texts):
468
  tokens = tokenizer.tokenize(text)
469
  num_tokens = len(tokens) # get number of tokens
470
  tokens_list.extend(tokens)
 
346
  except PdfReadError:
347
  path_to_file = pdf_blank
348
  filename = path_to_file.replace(examples_dir,"")
349
+ msg = "Invalid PDF file."
350
  images = [Image.open(image_blank)]
351
  else:
352
  try:
 
463
  # we want sorted lists from top to bottom of the image
464
  boxes, texts = sort_data_wo_labels(normalize_bboxes_par, texts)
465
 
466
+ bboxes_unique_list, texts_blocks = list(), list()
467
+ bbox_prev = [-100, -100, -100, -100]
468
+ for bbox, text in zip(boxes, texts):
469
+ if bbox != bbox_prev and bbox != cls_box:
470
+ bboxes_unique_list.append(bbox)
471
+ texts_block = text
472
+ else:
473
+ if bbox != cls_box:
474
+ texts_block += '\n' + text
475
+ else:
476
+ texts_blocks.append(texts_block)
477
+ bbox_prev = bbox
478
+
479
  count = 0
480
+ for box, text in zip(bboxes_unique_list, texts_blocks):
481
  tokens = tokenizer.tokenize(text)
482
  num_tokens = len(tokens) # get number of tokens
483
  tokens_list.extend(tokens)