presidio_WW

Sleeping

App Files Files Community

presidio committed on May 30, 2023

Commit

b7be871

1 Parent(s): 3477655

Upload 3 files

Browse files

Files changed (2) hide show

transformers_rec/configuration.py +12 -4
transformers_rec/transformers_recognizer.py +42 -34

transformers_rec/configuration.py CHANGED Viewed

@@ -1,3 +1,5 @@
 STANFORD_COFIGURATION = {
     "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
     "PRESIDIO_SUPPORTED_ENTITIES": [
@@ -11,7 +13,8 @@ STANFORD_COFIGURATION = {
         "DEVICE",
         "ZIP",
         "PROFESSION",
-        "USERNAME"
     ],
     "LABELS_TO_IGNORE": ["O"],
@@ -22,8 +25,8 @@ STANFORD_COFIGURATION = {
         "DOCTOR": "PERSON",
         "PATIENT": "PERSON",
         "HOSPITAL": "LOCATION",
-        "MEDICALRECORD": "O",
-        "IDNUM": "O",
         "ORGANIZATION": "ORGANIZATION",
         "ZIP": "ZIP",
         "PHONE": "PHONE_NUMBER",
@@ -55,6 +58,8 @@ STANFORD_COFIGURATION = {
     },
     "CHUNK_OVERLAP_SIZE": 40,
     "CHUNK_SIZE": 600,
 }
@@ -70,6 +75,7 @@ BERT_DEID_CONFIGURATION = {
         "ZIP",
         "PROFESSION",
         "USERNAME",
     ],
     "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
     "LABELS_TO_IGNORE": ["O"],
@@ -102,7 +108,7 @@ BERT_DEID_CONFIGURATION = {
         "LOC": "LOCATION",
         "ORG": "ORGANIZATION",
         "AGE": "AGE",
-        "ID": "O",
         "EMAIL": "EMAIL",
         "PATIENT": "PERSON",
         "STAFF": "PERSON",
@@ -113,4 +119,6 @@ BERT_DEID_CONFIGURATION = {
     },
     "CHUNK_OVERLAP_SIZE": 40,
     "CHUNK_SIZE": 600,
 }

+## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py
 STANFORD_COFIGURATION = {
     "DEFAULT_MODEL_PATH": "StanfordAIMI/stanford-deidentifier-base",
     "PRESIDIO_SUPPORTED_ENTITIES": [
         "DEVICE",
         "ZIP",
         "PROFESSION",
+        "USERNAME",
+        "ID"
     ],
     "LABELS_TO_IGNORE": ["O"],
         "DOCTOR": "PERSON",
         "PATIENT": "PERSON",
         "HOSPITAL": "LOCATION",
+        "MEDICALRECORD": "ID",
+        "IDNUM": "ID",
         "ORGANIZATION": "ORGANIZATION",
         "ZIP": "ZIP",
         "PHONE": "PHONE_NUMBER",
     },
     "CHUNK_OVERLAP_SIZE": 40,
     "CHUNK_SIZE": 600,
+    "ID_SCORE_MULTIPLIER": 0.4,
+    "ID_ENTITY_NAME": "ID"
 }
         "ZIP",
         "PROFESSION",
         "USERNAME",
+        "ID"
     ],
     "DEFAULT_MODEL_PATH": "obi/deid_roberta_i2b2",
     "LABELS_TO_IGNORE": ["O"],
         "LOC": "LOCATION",
         "ORG": "ORGANIZATION",
         "AGE": "AGE",
+        "ID": "ID",
         "EMAIL": "EMAIL",
         "PATIENT": "PERSON",
         "STAFF": "PERSON",
     },
     "CHUNK_OVERLAP_SIZE": 40,
     "CHUNK_SIZE": 600,
+    "ID_SCORE_MULTIPLIER": 0.4,
+    "ID_ENTITY_NAME": "ID"
 }

transformers_rec/transformers_recognizer.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import copy
 import logging
 from typing import Optional, List
@@ -90,6 +92,8 @@ class TransformersRecognizer(EntityRecognizer):
         self.default_explanation = None
         self.text_overlap_length = None
         self.chunk_length = None
     def load_transformer(self, **kwargs) -> None:
         """Load external configuration parameters and set default values.
@@ -104,6 +108,8 @@ class TransformersRecognizer(EntityRecognizer):
         **CHUNK_SIZE (int) - number of characters in each chunk of text
         **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
         **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
         """
         self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
@@ -113,6 +119,9 @@ class TransformersRecognizer(EntityRecognizer):
         self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
         self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
         self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
         if not self.pipeline:
             if not self.model_path:
                 self.model_path = "obi/deid_roberta_i2b2"
@@ -165,11 +174,14 @@ class TransformersRecognizer(EntityRecognizer):
         ner_results = self._get_ner_results_for_text(text)
         for res in ner_results:
-            entity = self.model_to_presidio_mapping.get(res["entity_group"], None)
-            if not entity:
                 continue
-            res["entity_group"] = self.__check_label_transformer(res["entity_group"])
             textual_explanation = self.default_explanation.format(res["entity_group"])
             explanation = self.build_transformers_explanation(
                 float(round(res["score"], 2)), textual_explanation, res["word"]
@@ -224,33 +236,32 @@ class TransformersRecognizer(EntityRecognizer):
         model_max_length = self.pipeline.tokenizer.model_max_length
         # calculate inputs based on the text
         text_length = len(text)
-        predictions = list()
-        if text_length > model_max_length*2:
-            # split text into chunks
             logger.info(
-                f"splitting the text into chunks, length {text_length} > {model_max_length*2}"
             )
             chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
                 text_length, self.chunk_length, self.text_overlap_length
-            )
-        else:
-            chunk_indexes = [[0, text_length]]
-        # iterate over text chunks and run inference
-        for chunk_start, chunk_end in chunk_indexes:
-            chunk_text = text[chunk_start:chunk_end]
-            chunk_preds = self.pipeline(chunk_text)
-            # align indexes to match the original text - add to each position the value of chunk_start
-            aligned_predictions = list()
-            for prediction in chunk_preds:
-                prediction_tmp = copy.deepcopy(prediction)
-                prediction_tmp["start"] += chunk_start
-                prediction_tmp["end"] += chunk_start
-                aligned_predictions.append(prediction_tmp)
-            predictions.extend(aligned_predictions)
         # remove duplicates
         predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
@@ -302,27 +313,24 @@ class TransformersRecognizer(EntityRecognizer):
         )
         return explanation
-    def __check_label_transformer(self, label: str) -> str:
         """The function validates the predicted label is identified by Presidio
         and maps the string into a Presidio representation
         :param label: Predicted label by the model
-        :type label: str
-        :return: Returns the predicted entity if the label is found in model_to_presidio mapping dictionary
-        and is supported by Presidio entities
-        :rtype: str
         """
-        if label == "O":
-            return label
         # convert model label to presidio label
         entity = self.model_to_presidio_mapping.get(label, None)
         if entity is None:
-            logger.warning(f"Found unrecognized label {label}, returning entity as 'O'")
-            return "O"
         if entity not in self.supported_entities:
             logger.warning(f"Found entity {entity} which is not supported by Presidio")
-            return "O"
         return entity

+# Modified from https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py
 import copy
 import logging
 from typing import Optional, List
         self.default_explanation = None
         self.text_overlap_length = None
         self.chunk_length = None
+        self.id_entity_name = None
+        self.id_score_reduction = None
     def load_transformer(self, **kwargs) -> None:
         """Load external configuration parameters and set default values.
         **CHUNK_SIZE (int) - number of characters in each chunk of text
         **LABELS_TO_IGNORE (List(str)) - List of entities to skip evaluation. Defaults to ["O"]
         **DEFAULT_EXPLANATION (str) - string format to use for prediction explanations
+        **ID_ENTITY_NAME (str) - name of the ID entity
+        **ID_SCORE_REDUCTION (float) - score multiplier for ID entities
         """
         self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
         self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", None)
         self.text_overlap_length = kwargs.get("CHUNK_OVERLAP_SIZE", 40)
         self.chunk_length = kwargs.get("CHUNK_SIZE", 600)
+        self.id_entity_name = kwargs.get("ID_ENTITY_NAME", "ID")
+        self.id_score_reduction = kwargs.get("ID_SCORE_REDUCTION", 0.5)
         if not self.pipeline:
             if not self.model_path:
                 self.model_path = "obi/deid_roberta_i2b2"
         ner_results = self._get_ner_results_for_text(text)
         for res in ner_results:
+            res["entity_group"] = self.__check_label_transformer(res["entity_group"])
+            if not res["entity_group"]:
                 continue
+            if res["entity_group"] == self.id_entity_name:
+                print(f"ID entity found, multiplying score by {self.id_score_reduction}")
+                res["score"] = res["score"] * self.id_score_reduction
             textual_explanation = self.default_explanation.format(res["entity_group"])
             explanation = self.build_transformers_explanation(
                 float(round(res["score"], 2)), textual_explanation, res["word"]
         model_max_length = self.pipeline.tokenizer.model_max_length
         # calculate inputs based on the text
         text_length = len(text)
+        # split text into chunks
+        if text_length <= model_max_length:
+            predictions = self.pipeline(text)
+        else:
             logger.info(
+                f"splitting the text into chunks, length {text_length} > {model_max_length}"
             )
+            predictions = list()
             chunk_indexes = TransformersRecognizer.split_text_to_word_chunks(
                 text_length, self.chunk_length, self.text_overlap_length
+                )
+            # iterate over text chunks and run inference
+            for chunk_start, chunk_end in chunk_indexes:
+                chunk_text = text[chunk_start:chunk_end]
+                chunk_preds = self.pipeline(chunk_text)
+                # align indexes to match the original text - add to each position the value of chunk_start
+                aligned_predictions = list()
+                for prediction in chunk_preds:
+                    prediction_tmp = copy.deepcopy(prediction)
+                    prediction_tmp["start"] += chunk_start
+                    prediction_tmp["end"] += chunk_start
+                    aligned_predictions.append(prediction_tmp)
+                predictions.extend(aligned_predictions)
         # remove duplicates
         predictions = [dict(t) for t in {tuple(d.items()) for d in predictions}]
         )
         return explanation
+    def __check_label_transformer(self, label: str) -> Optional[str]:
         """The function validates the predicted label is identified by Presidio
         and maps the string into a Presidio representation
         :param label: Predicted label by the model
+        :return: Returns the adjusted entity name
         """
         # convert model label to presidio label
         entity = self.model_to_presidio_mapping.get(label, None)
+        if entity in self.ignore_labels:
+            return None
         if entity is None:
+            logger.warning(f"Found unrecognized label {label}, returning entity as is")
+            return label
         if entity not in self.supported_entities:
             logger.warning(f"Found entity {entity} which is not supported by Presidio")
+            return entity
         return entity