Commit
·
c9b4e80
1
Parent(s):
70bfccd
Update files/functions.py
Browse files
- files/functions.py +36 -0
files/functions.py
CHANGED
@@ -147,6 +147,42 @@ for lang_t, langcode_t in zip(langs_t,langscode_t):
|
|
147 |
# Reverse lookup built by swapping each key/value pair of Tesseract2langdetect;
# if two keys share the same value, the pair iterated last wins.
langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
|
148 |
|
149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
# General
|
151 |
|
152 |
# get text and bounding boxes from an image
|
|
|
147 |
# Reverse lookup built by swapping each key/value pair of Tesseract2langdetect;
# if two keys share the same value, the pair iterated last wins.
langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
|
148 |
|
149 |
|
150 |
+
## model / feature extractor / tokenizer
|
151 |
+
|
152 |
+
# Select the compute device
import torch

# Use the CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
|
155 |
+
|
156 |
+
## LiLT: tokenizer and token-classification model
import transformers
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Tokenizer and model are both loaded from the same checkpoint id (model_id_lilt).
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)

# Load the weights, then move the model onto the selected device.
# Module.to() returns the module itself, so the chained call binds the same object.
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt).to(device)
|
162 |
+
|
163 |
+
## LayoutXLM: model, feature extractor and tokenizer
# LayoutXLMTokenizerFast is not imported; the tokenizer is resolved via AutoTokenizer.
from transformers import (
    AutoTokenizer,
    LayoutLMv2FeatureExtractor,
    LayoutLMv2ForTokenClassification,
)

# model: load the checkpoint and move it onto the selected device
# (Module.to() returns the module itself, so chaining binds the same object).
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(
    model_id_layoutxlm
).to(device)

# feature extractor: created with apply_ocr=False, i.e. the extractor's own OCR is off
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

# tokenizer
tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)
|
175 |
+
|
176 |
+
# Label maps and label counts, read from each model's config

# LiLT
id2label_lilt, label2id_lilt = (
    model_lilt.config.id2label,
    model_lilt.config.label2id,
)
num_labels_lilt = len(id2label_lilt)

# LayoutXLM
id2label_layoutxlm, label2id_layoutxlm = (
    model_layoutxlm.config.id2label,
    model_layoutxlm.config.label2id,
)
num_labels_layoutxlm = len(id2label_layoutxlm)
|
184 |
+
|
185 |
+
|
186 |
# General
|
187 |
|
188 |
# get text and bounding boxes from an image
|