Commit
·
e225cae
1
Parent(s):
3ab39f3
Update files/functions.py
Browse files
- files/functions.py +15 -2
files/functions.py
CHANGED
@@ -346,7 +346,7 @@ def pdf_to_images(uploaded_pdf):
|
|
346 |
except PdfReadError:
|
347 |
path_to_file = pdf_blank
|
348 |
filename = path_to_file.replace(examples_dir,"")
|
349 |
-
msg = "
|
350 |
images = [Image.open(image_blank)]
|
351 |
else:
|
352 |
try:
|
@@ -463,8 +463,21 @@ def prepare_inference_features(example):
|
|
463 |
# we want sorted lists from top to bottom of the image
|
464 |
boxes, texts = sort_data_wo_labels(normalize_bboxes_par, texts)
|
465 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
466 |
count = 0
|
467 |
-
for box, text in zip(boxes, texts):
|
468 |
tokens = tokenizer.tokenize(text)
|
469 |
num_tokens = len(tokens) # get number of tokens
|
470 |
tokens_list.extend(tokens)
|
|
|
346 |
except PdfReadError:
|
347 |
path_to_file = pdf_blank
|
348 |
filename = path_to_file.replace(examples_dir,"")
|
349 |
+
msg = "Invalid PDF file."
|
350 |
images = [Image.open(image_blank)]
|
351 |
else:
|
352 |
try:
|
|
|
463 |
# we want sorted lists from top to bottom of the image
|
464 |
boxes, texts = sort_data_wo_labels(normalize_bboxes_par, texts)
|
465 |
|
466 |
+
bboxes_unique_list, texts_blocks = list(), list()
|
467 |
+
bbox_prev = [-100, -100, -100, -100]
|
468 |
+
for bbox, text in zip(boxes, texts):
|
469 |
+
if bbox != bbox_prev and bbox != cls_box:
|
470 |
+
bboxes_unique_list.append(bbox)
|
471 |
+
texts_block = text
|
472 |
+
else:
|
473 |
+
if bbox != cls_box:
|
474 |
+
texts_block += '\n' + text
|
475 |
+
else:
|
476 |
+
texts_blocks.append(texts_block)
|
477 |
+
bbox_prev = bbox
|
478 |
+
|
479 |
count = 0
|
480 |
+
for box, text in zip(bboxes_unique_list, texts_blocks):
|
481 |
tokens = tokenizer.tokenize(text)
|
482 |
num_tokens = len(tokens) # get number of tokens
|
483 |
tokens_list.extend(tokens)
|