Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files
transformers_rec/configuration.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
STANFORD_COFIGURATION = {
|
| 2 |
"DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
|
| 3 |
"PRESIDIO_SUPPORTED_ENTITIES": [
|
|
@@ -11,7 +13,8 @@ STANFORD_COFIGURATION = {
|
|
| 11 |
"DEVICE",
|
| 12 |
"ZIP",
|
| 13 |
"PROFESSION",
|
| 14 |
-
"USERNAME"
|
|
|
|
| 15 |
|
| 16 |
],
|
| 17 |
"LABELS_TO_IGNORE": ["O"],
|
|
@@ -22,8 +25,8 @@ STANFORD_COFIGURATION = {
|
|
| 22 |
"DOCTOR": "PERSON",
|
| 23 |
"PATIENT": "PERSON",
|
| 24 |
"HOSPITAL": "LOCATION",
|
| 25 |
-
"MEDICALRECORD": "
|
| 26 |
-
"IDNUM": "
|
| 27 |
"ORGANIZATION": "ORGANIZATION",
|
| 28 |
"ZIP": "ZIP",
|
| 29 |
"PHONE": "PHONE_NUMBER",
|
|
@@ -55,6 +58,8 @@ STANFORD_COFIGURATION = {
|
|
| 55 |
},
|
| 56 |
"CHUNK_OVERLAP_SIZE": 40,
|
| 57 |
"CHUNK_SIZE": 600,
|
|
|
|
|
|
|
| 58 |
}
|
| 59 |
|
| 60 |
|
|
@@ -70,6 +75,7 @@ BERT_DEID_CONFIGURATION = {
|
|
| 70 |
"ZIP",
|
| 71 |
"PROFESSION",
|
| 72 |
"USERNAME",
|
|
|
|
| 73 |
],
|
| 74 |
"DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
|
| 75 |
"LABELS_TO_IGNORE": ["O"],
|
|
@@ -102,7 +108,7 @@ BERT_DEID_CONFIGURATION = {
|
|
| 102 |
"LOC": "LOCATION",
|
| 103 |
"ORG": "ORGANIZATION",
|
| 104 |
"AGE": "AGE",
|
| 105 |
-
"ID": "
|
| 106 |
"EMAIL": "EMAIL",
|
| 107 |
"PATIENT": "PERSON",
|
| 108 |
"STAFF": "PERSON",
|
|
@@ -113,4 +119,6 @@ BERT_DEID_CONFIGURATION = {
|
|
| 113 |
},
|
| 114 |
"CHUNK_OVERLAP_SIZE": 40,
|
| 115 |
"CHUNK_SIZE": 600,
|
|
|
|
|
|
|
| 116 |
}
|
|
|
|
| 1 |
+
## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py
|
| 2 |
+
|
| 3 |
STANFORD_COFIGURATION = {
|
| 4 |
"DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
|
| 5 |
"PRESIDIO_SUPPORTED_ENTITIES": [
|
|
|
|
| 13 |
"DEVICE",
|
| 14 |
"ZIP",
|
| 15 |
"PROFESSION",
|
| 16 |
+
"USERNAME",
|
| 17 |
+
"ID"
|
| 18 |
|
| 19 |
],
|
| 20 |
"LABELS_TO_IGNORE": ["O"],
|
|
|
|
| 25 |
"DOCTOR": "PERSON",
|
| 26 |
"PATIENT": "PERSON",
|
| 27 |
"HOSPITAL": "LOCATION",
|
| 28 |
+
"MEDICALRECORD": "ID",
|
| 29 |
+
"IDNUM": "ID",
|
| 30 |
"ORGANIZATION": "ORGANIZATION",
|
| 31 |
"ZIP": "ZIP",
|
| 32 |
"PHONE": "PHONE_NUMBER",
|
|
|
|
| 58 |
},
|
| 59 |
"CHUNK_OVERLAP_SIZE": 40,
|
| 60 |
"CHUNK_SIZE": 600,
|
| 61 |
+
"ID_SCORE_MULTIPLIER": 0.4,
|
| 62 |
+
"ID_ENTITY_NAME": "ID"
|
| 63 |
}
|
| 64 |
|
| 65 |
|
|
|
|
| 75 |
"ZIP",
|
| 76 |
"PROFESSION",
|
| 77 |
"USERNAME",
|
| 78 |
+
"ID"
|
| 79 |
],
|
| 80 |
"DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
|
| 81 |
"LABELS_TO_IGNORE": ["O"],
|
|
|
|
| 108 |
"LOC": "LOCATION",
|
| 109 |
"ORG": "ORGANIZATION",
|
| 110 |
"AGE": "AGE",
|
| 111 |
+
"ID": "ID",
|
| 112 |
"EMAIL": "EMAIL",
|
| 113 |
"PATIENT": "PERSON",
|
| 114 |
"STAFF": "PERSON",
|
|
|
|
| 119 |
},
|
| 120 |
"CHUNK_OVERLAP_SIZE": 40,
|
| 121 |
"CHUNK_SIZE": 600,
|
| 122 |
+
"ID_SCORE_MULTIPLIER": 0.4,
|
| 123 |
+
"ID_ENTITY_NAME": "ID"
|
| 124 |
}
|
transformers_rec/transformers_recognizer.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import copy
|
| 2 |
import logging
|
| 3 |
from typing import Optional, List
|
|
@@ -90,6 +92,8 @@ class TransformersRecognizer(EntityRecognizer):
|
|
| 90 |
self.default_explanation = None
|
| 91 |
self.text_overlap_length = None
|
| 92 |
self.chunk_length = None
|
|
|
|
|
|
|
| 93 |
|
| 94 |
def load_transformer(self, **kwargs) -> None:
|
| 95 |
"""Load external configuration parameters and set default values.
|
|
@@ -104,6 +108,8 @@ class TransformersRecognizer(EntityRecognizer):
|
|
| 104 |
**CHUNK_SIZE (int) - number of characters in each chunk of text
|
| 105 |
**LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
|
| 106 |
**DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
|
|
|
|
|
|
|
| 107 |
"""
|
| 108 |
|
| 109 |
self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
|
|
@@ -113,6 +119,9 @@ class TransformersRecognizer(EntityRecognizer):
|
|
| 113 |
self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
|
| 114 |
self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
|
| 115 |
self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
|
|
|
|
|
|
|
|
|
|
| 116 |
if not self.pipeline:
|
| 117 |
if not self.model_path:
|
| 118 |
self.model_path = "obi/deid_roberta_i2b2"
|
|
@@ -165,11 +174,14 @@ class TransformersRecognizer(EntityRecognizer):
|
|
| 165 |
ner_results = self._get_ner_results_for_text(text)
|
| 166 |
|
| 167 |
for res in ner_results:
|
| 168 |
-
|
| 169 |
-
if not
|
| 170 |
continue
|
| 171 |
|
| 172 |
-
res["entity_group"]
|
|
|
|
|
|
|
|
|
|
| 173 |
textual_explanation = self.default_explanation.format(res["entity_group"])
|
| 174 |
explanation = self.build_transformers_explanation(
|
| 175 |
float(round(res["score"], 2)), textual_explanation, res["word"]
|
|
@@ -224,33 +236,32 @@ class TransformersRecognizer(EntityRecognizer):
|
|
| 224 |
model_max_length = self.pipeline.tokenizer.model_max_length
|
| 225 |
# calculate inputs based on the text
|
| 226 |
text_length = len(text)
|
| 227 |
-
|
| 228 |
-
if text_length
|
| 229 |
-
|
|
|
|
| 230 |
logger.info(
|
| 231 |
-
f"splitting the text into chunks, length {text_length} > {model_max_length
|
| 232 |
)
|
| 233 |
-
|
| 234 |
chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
|
| 235 |
text_length, self.chunk_length, self.text_overlap_length
|
| 236 |
-
|
| 237 |
-
else:
|
| 238 |
-
chunk_indexes = [[0, text_length]]
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
|
| 253 |
-
|
| 254 |
|
| 255 |
# remove duplicates
|
| 256 |
predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
|
|
@@ -302,27 +313,24 @@ class TransformersRecognizer(EntityRecognizer):
|
|
| 302 |
)
|
| 303 |
return explanation
|
| 304 |
|
| 305 |
-
def __check_label_transformer(self, label: str) -> str:
|
| 306 |
"""The function validates the predicted label is identified by Presidio
|
| 307 |
and maps the string into a Presidio representation
|
| 308 |
:param label: Predicted label by the model
|
| 309 |
-
:
|
| 310 |
-
:return: Returns the predicted entity if the label is found in model_to_presidio mapping dictionary
|
| 311 |
-
and is supported by Presidio entities
|
| 312 |
-
:rtype: str
|
| 313 |
"""
|
| 314 |
|
| 315 |
-
if label == "O":
|
| 316 |
-
return label
|
| 317 |
-
|
| 318 |
# convert model label to presidio label
|
| 319 |
entity = self.model_to_presidio_mapping.get(label, None)
|
| 320 |
|
|
|
|
|
|
|
|
|
|
| 321 |
if entity is None:
|
| 322 |
-
logger.warning(f"Found unrecognized label {label}, returning entity as
|
| 323 |
-
return
|
| 324 |
|
| 325 |
if entity not in self.supported_entities:
|
| 326 |
logger.warning(f"Found entity {entity} which is not supported by Presidio")
|
| 327 |
-
return
|
| 328 |
return entity
|
|
|
|
| 1 |
+
# Modified from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py
|
| 2 |
+
|
| 3 |
import copy
|
| 4 |
import logging
|
| 5 |
from typing import Optional, List
|
|
|
|
| 92 |
self.default_explanation = None
|
| 93 |
self.text_overlap_length = None
|
| 94 |
self.chunk_length = None
|
| 95 |
+
self.id_entity_name = None
|
| 96 |
+
self.id_score_reduction = None
|
| 97 |
|
| 98 |
def load_transformer(self, **kwargs) -> None:
|
| 99 |
"""Load external configuration parameters and set default values.
|
|
|
|
| 108 |
**CHUNK_SIZE (int) - number of characters in each chunk of text
|
| 109 |
**LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
|
| 110 |
**DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
|
| 111 |
+
**ID_ENTITY_NAME (str) - entity name whose prediction scores are scaled by ID_SCORE_REDUCTION. Defaults to "ID"
|
| 112 |
+
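**ID_SCORE_REDUCTION (float) - factor the score of ID entities is multiplied by. Defaults to 0.5 (NOTE(review): configuration.py sets "ID_SCORE_MULTIPLIER" instead - confirm which key is intended)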
|
| 113 |
"""
|
| 114 |
|
| 115 |
self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
|
|
|
|
| 119 |
self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
|
| 120 |
self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
|
| 121 |
self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
|
| 122 |
+
self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
|
| 123 |
+
self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
|
| 124 |
+
|
| 125 |
if not self.pipeline:
|
| 126 |
if not self.model_path:
|
| 127 |
self.model_path = "obi/deid_roberta_i2b2"
|
|
|
|
| 174 |
ner_results = self._get_ner_results_for_text(text)
|
| 175 |
|
| 176 |
for res in ner_results:
|
| 177 |
+
res["entity_group"] = self.__check_label_transformer(res["entity_group"])
|
| 178 |
+
if not res["entity_group"]:
|
| 179 |
continue
|
| 180 |
|
| 181 |
+
if res["entity_group"] == self.id_entity_name:
|
| 182 |
+
print(f"ID entity found, multiplying score by {self.id_score_reduction}")
|
| 183 |
+
res["score"] = res["score"] * self.id_score_reduction
|
| 184 |
+
|
| 185 |
textual_explanation = self.default_explanation.format(res["entity_group"])
|
| 186 |
explanation = self.build_transformers_explanation(
|
| 187 |
float(round(res["score"], 2)), textual_explanation, res["word"]
|
|
|
|
| 236 |
model_max_length = self.pipeline.tokenizer.model_max_length
|
| 237 |
# calculate inputs based on the text
|
| 238 |
text_length = len(text)
|
| 239 |
+
# split text into chunks
|
| 240 |
+
if text_length <= model_max_length:
|
| 241 |
+
predictions = self.pipeline(text)
|
| 242 |
+
else:
|
| 243 |
logger.info(
|
| 244 |
+
f"splitting the text into chunks, length {text_length} > {model_max_length}"
|
| 245 |
)
|
| 246 |
+
predictions = list()
|
| 247 |
chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
|
| 248 |
text_length, self.chunk_length, self.text_overlap_length
|
| 249 |
+
)
|
|
|
|
|
|
|
| 250 |
|
| 251 |
+
# iterate over text chunks and run inference
|
| 252 |
+
for chunk_start, chunk_end in chunk_indexes:
|
| 253 |
+
chunk_text = text[chunk_start:chunk_end]
|
| 254 |
+
chunk_preds = self.pipeline(chunk_text)
|
| 255 |
|
| 256 |
+
# align indexes to match the original text - add to each position the value of chunk_start
|
| 257 |
+
aligned_predictions = list()
|
| 258 |
+
for prediction in chunk_preds:
|
| 259 |
+
prediction_tmp = copy.deepcopy(prediction)
|
| 260 |
+
prediction_tmp["start"] += chunk_start
|
| 261 |
+
prediction_tmp["end"] += chunk_start
|
| 262 |
+
aligned_predictions.append(prediction_tmp)
|
| 263 |
|
| 264 |
+
predictions.extend(aligned_predictions)
|
| 265 |
|
| 266 |
# remove duplicates
|
| 267 |
predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
|
|
|
|
| 313 |
)
|
| 314 |
return explanation
|
| 315 |
|
| 316 |
+
def __check_label_transformer(self, label: str) -> Optional[str]:
    """Map a label predicted by the model to its Presidio entity name.

    :param label: Predicted label by the model
    :return: ``None`` when the label, or the entity it maps to, is listed in
        ``self.ignore_labels``; the label unchanged when it has no entry in
        ``self.model_to_presidio_mapping``; otherwise the mapped entity. A
        mapped entity missing from ``self.supported_entities`` is still
        returned, after a warning is logged.
    """

    # convert model label to presidio label
    entity = self.model_to_presidio_mapping.get(label)

    # Check the raw label as well as the mapped entity: an ignored label that
    # has no mapping entry would otherwise be reported as "unrecognized" and
    # handed back to the caller instead of being dropped.
    if label in self.ignore_labels or entity in self.ignore_labels:
        return None

    if entity is None:
        logger.warning(f"Found unrecognized label {label}, returning entity as is")
        return label

    # Unsupported entities are only reported; the entity is returned either way.
    if entity not in self.supported_entities:
        logger.warning(f"Found entity {entity} which is not supported by Presidio")
    return entity
|