presidio_WW

Sleeping

App Files Files Community

presidio committed on Apr 16, 2023

Commit

547518c

1 Parent(s): 6f1792a

Upload 8 files

Browse files

Files changed (7) hide show

Dockerfile +1 -0
demo_text.txt +1 -1
flair_recognizer.py +189 -0
flair_test.py +27 -0
openai_fake_data_generator.py +9 -13
presidio_streamlit.py +99 -15
requirements.txt +3 -1

Dockerfile CHANGED Viewed

@@ -13,6 +13,7 @@ COPY ./requirements.txt /code/requirements.txt
 RUN pip3 install -r requirements.txt
 RUN pip3 install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
 RUN pip3 install https://huggingface.co/spacy/en_core_web_lg/resolve/main/en_core_web_lg-any-py3-none-any.whl
 EXPOSE 7860
 COPY . /code

 RUN pip3 install -r requirements.txt
 RUN pip3 install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
 RUN pip3 install https://huggingface.co/spacy/en_core_web_lg/resolve/main/en_core_web_lg-any-py3-none-any.whl
 EXPOSE 7860
 COPY . /code

demo_text.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-Here are a few examples sentences we currently support:
 Hello, my name is David Johnson and I live in Maine.
 My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

+Here are a few example sentences we currently support:
 Hello, my name is David Johnson and I live in Maine.
 My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

flair_recognizer.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import logging
+from typing import Optional, List, Tuple, Set
+from presidio_analyzer import (
+    RecognizerResult,
+    EntityRecognizer,
+    AnalysisExplanation,
+)
+from presidio_analyzer.nlp_engine import NlpArtifacts
+from flair.data import Sentence
+from flair.models import SequenceTagger
+logger = logging.getLogger("presidio-analyzer")
+class FlairRecognizer(EntityRecognizer):
+    """
+    Presidio recognizer that wraps a Flair sequence tagger (NER model).
+    Flair span labels are matched against the requested Presidio entities
+    through ``check_label_groups`` and returned as ``RecognizerResult`` objects.
+    :example:
+    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+    >flair_recognizer = FlairRecognizer()
+    >registry = RecognizerRegistry()
+    >registry.add_recognizer(flair_recognizer)
+    >analyzer = AnalyzerEngine(registry=registry)
+    >results = analyzer.analyze(
+    >    "My name is Christopher and I live in Irbid.",
+    >    language="en",
+    >    return_decision_process=True,
+    >)
+    >for result in results:
+    >    print(result)
+    >    print(result.analysis_explanation)
+    """
+    # Presidio entity names this recognizer reports by default.
+    ENTITIES = [
+        "LOCATION",
+        "PERSON",
+        "ORGANIZATION",
+        # "MISCELLANEOUS"   # - There is no direct correlation with Presidio entities.
+    ]
+    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+    # Pairs of (Presidio entity names, Flair labels) that count as a match.
+    CHECK_LABEL_GROUPS = [
+        ({"LOCATION"}, {"LOC", "LOCATION"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"ORGANIZATION"}, {"ORG"}),
+        # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
+    ]
+    # Default Flair model name per supported language code.
+    MODEL_LANGUAGES = {
+        "en": "flair/ner-english-large"
+    }
+    # Flair label -> Presidio entity name used in the returned results.
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOCATION",
+        "ORG": "ORGANIZATION",
+        # 'MISC': 'MISCELLANEOUS'   # - Probably not PII
+    }
+    def __init__(
+        self,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
+        model: Optional[SequenceTagger] = None,
+    ):
+        """
+        Set up the recognizer and load the Flair model when none is supplied.
+        :param supported_language: Language code; must be a key of MODEL_LANGUAGES
+            unless `model` is given.
+        :param supported_entities: Presidio entities to report, defaults to ENTITIES.
+        :param check_label_groups: Pairs of (Presidio entities, Flair labels) that
+            are considered a match, defaults to CHECK_LABEL_GROUPS.
+        :param model: An already loaded Flair SequenceTagger to use as is.
+        :raises ValueError: If no model is given and the language has no default model.
+        """
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        if model is None:
+            model_name = self.MODEL_LANGUAGES.get(supported_language)
+            if model_name is None:
+                # Fail early with a readable message instead of handing None to Flair.
+                raise ValueError(
+                    f"No default Flair model for language '{supported_language}'. "
+                    f"Supported languages: {sorted(self.MODEL_LANGUAGES)}"
+                )
+            model = SequenceTagger.load(model_name)
+        # The model is assigned before the parent initializer runs, as in the original.
+        self.model = model
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Flair Analytics",
+        )
+    def load(self) -> None:
+        """Load the model, not used. Model is loaded during initialization."""
+    def get_supported_entities(self) -> List[str]:
+        """
+        Return supported entities by this model.
+        :return: List of the supported entities.
+        """
+        return self.supported_entities
+    def analyze(
+        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
+    ) -> List[RecognizerResult]:
+        """
+        Analyze text using the Flair model.
+        :param text: The text for analysis.
+        :param entities: Presidio entities to look for. Entities this recognizer
+            does not support are ignored; when empty, all supported entities are used.
+        :param nlp_artifacts: Not used by this recognizer.
+        :return: The list of Presidio RecognizerResult constructed from the recognized
+            Flair detections.
+        """
+        results = []
+        if not text:
+            # Nothing to tag, so skip running the model.
+            return results
+        sentence = Sentence(text)
+        self.model.predict(sentence)
+        # If there is no specific list of entities, we will look for all of them.
+        if not entities:
+            entities = self.supported_entities
+        # The predicted spans do not depend on the requested entity, fetch them once.
+        spans = sentence.get_spans("ner")
+        # dict.fromkeys drops repeated entity names while keeping the caller's order,
+        # so the same span is not reported twice for one requested entity.
+        for entity in dict.fromkeys(entities):
+            if entity not in self.supported_entities:
+                continue
+            for span in spans:
+                label = span.labels[0].value
+                if not self.__check_label(entity, label, self.check_label_groups):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(label)
+                explanation = self.build_flair_explanation(
+                    round(span.score, 2), textual_explanation
+                )
+                results.append(self._convert_to_recognizer_result(span, explanation))
+        return results
+    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+        """
+        Convert a Flair span into a Presidio RecognizerResult.
+        :param entity: Flair span; its `tag`, `score`, `start_position` and
+            `end_position` attributes are read.
+        :param explanation: AnalysisExplanation attached to the result.
+        :return: The result, with the Flair tag translated through
+            PRESIDIO_EQUIVALENCES (unknown tags are kept as is).
+        """
+        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+        flair_score = round(entity.score, 2)
+        flair_results = RecognizerResult(
+            entity_type=entity_type,
+            start=entity.start_position,
+            end=entity.end_position,
+            score=flair_score,
+            analysis_explanation=explanation,
+        )
+        return flair_results
+    def build_flair_explanation(
+        self, original_score: float, explanation: str
+    ) -> AnalysisExplanation:
+        """
+        Create explanation for why this result was detected.
+        :param original_score: Score given by this recognizer
+        :param explanation: Explanation string
+        :return: AnalysisExplanation naming this class as the recognizer
+        """
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
+    ) -> bool:
+        """
+        Tell whether a Flair label corresponds to a Presidio entity.
+        :param entity: Presidio entity name
+        :param label: Flair label
+        :param check_label_groups: Pairs of (Presidio entities, Flair labels)
+        :return: True if some pair contains both the entity and the label
+        """
+        return any(
+            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
+        )

flair_test.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Import generic wrappers
+from transformers import AutoModel, AutoTokenizer
+if __name__ == "__main__":
+    from flair.data import Sentence
+    from flair.models import SequenceTagger
+    # Manual smoke check: tag one sentence with the Flair NER model and print the findings.
+    ner_model = SequenceTagger.load("flair/ner-english-large")
+    sample = Sentence("George Washington went to Washington")
+    # The spans are read back from `sample` below; predict()'s return value is not used.
+    ner_model.predict(sample)
+    # Show the sentence itself first
+    print(sample)
+    # Then each predicted NER span on its own line
+    print('The following NER tags are found:')
+    for span in sample.get_spans('ner'):
+        print(span)

openai_fake_data_generator.py CHANGED Viewed

@@ -1,37 +1,33 @@
 import openai
-frmo typing import List
-from presidio_analyzer import RecognizerResult
-from presidio_anonymizer import AnonymizerEngine
-def set_openai_key(openai_key:string):
     """Set the OpenAI API key.
     :param openai_key: the open AI key (https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)
     """
     openai.api_key = openai_key
-def call_completion_model(prompt:str, model:str="text-davinci-003", max_tokens:int=512) ->str:
     """Creates a request for the OpenAI Completion service and returns the response.
     :param prompt: The prompt for the completion model
     :param model: OpenAI model name
-    :param temperature: Model's temperature parameter
     """
     response = openai.Completion.create(
-        model=model,
-        prompt= prompt,
-        max_tokens=max_tokens
     )
-    return response['choices'][0].text
 def create_prompt(anonymized_text: str) -> str:
     """
     Create the prompt with instructions to GPT-3.
     :param anonymized_text: Text with placeholders instead of PII values, e.g. My name is <PERSON>.
     """

 import openai
+def set_openai_key(openai_key: str) -> None:
     """Set the OpenAI API key.
     :param openai_key: the open AI key (https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)
     """
+    # The key is stored as an attribute on the imported `openai` module.
     openai.api_key = openai_key
+def call_completion_model(
+    prompt: str, model: str = "text-davinci-003", max_tokens: int = 512
+) -> str:
     """Creates a request for the OpenAI Completion service and returns the response.
     :param prompt: The prompt for the completion model
     :param model: OpenAI model name
+    :param max_tokens: Model's max_tokens parameter
+    :return: The `text` of the first item in the response's `choices`
     """
     response = openai.Completion.create(
+        model=model, prompt=prompt, max_tokens=max_tokens
     )
+    # Only the first choice is used; any further choices in the response are ignored.
+    return response["choices"][0].text
 def create_prompt(anonymized_text: str) -> str:
     """
     Create the prompt with instructions to GPT-3.
     :param anonymized_text: Text with placeholders instead of PII values, e.g. My name is <PERSON>.
     """

presidio_streamlit.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """Streamlit app for Presidio."""
 from json import JSONEncoder
 from typing import List
@@ -12,13 +12,18 @@ from presidio_analyzer.nlp_engine import NlpEngineProvider
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
 from transformers_rec import (
     STANFORD_COFIGURATION,
     TransformersRecognizer,
     BERT_DEID_CONFIGURATION,
 )
-from openai_fake_data_generator import *
 # Helper methods
@@ -37,15 +42,26 @@ def analyzer_engine(model_path: str):
     # Set up NLP Engine according to the model of choice
     if model_path == "en_core_web_lg":
         nlp_configuration = {
             "nlp_engine_name": "spacy",
             "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
         }
     else:
         # Using a small spaCy model + a HF NER model
         transformers_recognizer = TransformersRecognizer(model_path=model_path)
         if model_path == "StanfordAIMI/stanford-deidentifier-base":
             transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
         elif model_path == "obi/deid_roberta_i2b2":
@@ -101,6 +117,7 @@ def anonymize(text: str, analyze_results: List[RecognizerResult]):
             "from_end": False,
         }
     elif st_operator == "encrypt":
         operator_config = {"key": st_encrypt_key}
     elif st_operator == "highlight":
@@ -108,8 +125,11 @@ def anonymize(text: str, analyze_results: List[RecognizerResult]):
     else:
         operator_config = None
     if st_operator == "highlight":
         operator = "custom"
     else:
         operator = st_operator
@@ -139,17 +159,39 @@ def annotate(text: str, analyze_results: List[RecognizerResult]):
             tokens.append(text[: res.start])
         # append entity text and entity type
-        tokens.append((text[res.start: res.end], res.entity_type))
         # if another entity coming i.e. we're not at the last results element, add text up to next entity
         if i != len(results) - 1:
-            tokens.append(text[res.end: results[i + 1].start])
         # if no more entities coming, add all remaining text
         else:
-            tokens.append(text[res.end:])
     return tokens
 st.set_page_config(page_title="Presidio demo", layout="wide")
 # Sidebar
@@ -175,20 +217,35 @@ st.sidebar.markdown(
 )
 st_model = st.sidebar.selectbox(
-    "NER model",
     [
         "StanfordAIMI/stanford-deidentifier-base",
         "obi/deid_roberta_i2b2",
         "en_core_web_lg",
     ],
     index=1,
 )
 st.sidebar.markdown("> Note: Models might take some time to download. ")
 st_operator = st.sidebar.selectbox(
     "De-identification approach",
-    ["redact", "replace", "mask", "hash", "encrypt", "highlight"],
     index=1,
 )
 if st_operator == "mask":
@@ -198,19 +255,36 @@ if st_operator == "mask":
     st_mask_char = st.sidebar.text_input("Mask character", value="*", max_chars=1)
 elif st_operator == "encrypt":
     st_encrypt_key = st.sidebar.text_input("AES key", value="WmZq4t7w!z%C&F)J")
 st_threshold = st.sidebar.slider(
-    label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
 )
 st_return_decision_process = st.sidebar.checkbox(
-    "Add analysis explanations to findings", value=False
 )
 st_entities = st.sidebar.multiselect(
     label="Which entities to look for?",
     options=get_supported_entities(),
     default=list(get_supported_entities()),
 )
 # Main panel
@@ -242,11 +316,21 @@ st_analyze_results = analyze(
 )
 # After
-if st_operator != "highlight":
     with col2:
         st.subheader(f"Output")
         st_anonymize_results = anonymize(st_text, st_analyze_results)
         st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
 else:
     st.subheader("Highlighted")
     annotated_tokens = annotate(st_text, st_analyze_results)
@@ -269,7 +353,7 @@ st.subheader(
 )
 if st_analyze_results:
     df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
-    df["text"] = [st_text[res.start: res.end] for res in st_analyze_results]
     df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
         {
@@ -281,7 +365,7 @@ if st_analyze_results:
         },
         axis=1,
     )
-    df_subset["Text"] = [st_text[res.start: res.end] for res in st_analyze_results]
     if st_return_decision_process:
         analysis_explanation_df = pd.DataFrame.from_records(
             [r.analysis_explanation.to_dict() for r in st_analyze_results]

 """Streamlit app for Presidio."""
+import os
 from json import JSONEncoder
 from typing import List
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
+from flair_recognizer import FlairRecognizer
 from transformers_rec import (
     STANFORD_COFIGURATION,
     TransformersRecognizer,
     BERT_DEID_CONFIGURATION,
 )
+from openai_fake_data_generator import (
+    set_openai_key,
+    call_completion_model,
+    create_prompt,
+)
 # Helper methods
     # Set up NLP Engine according to the model of choice
     if model_path == "en_core_web_lg":
+        if not spacy.util.is_package("en_core_web_lg"):
+            spacy.cli.download("en_core_web_lg")
         nlp_configuration = {
             "nlp_engine_name": "spacy",
             "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
         }
+    elif model_path == "flair/ner-english-large":
+        flair_recognizer = FlairRecognizer()
+        nlp_configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+        }
+        registry.add_recognizer(flair_recognizer)
+        registry.remove_recognizer("SpacyRecognizer")
     else:
+        if not spacy.util.is_package("en_core_web_sm"):
+            spacy.cli.download("en_core_web_sm")
         # Using a small spaCy model + a HF NER model
         transformers_recognizer = TransformersRecognizer(model_path=model_path)
+        registry.remove_recognizer("SpacyRecognizer")
         if model_path == "StanfordAIMI/stanford-deidentifier-base":
             transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
         elif model_path == "obi/deid_roberta_i2b2":
             "from_end": False,
         }
+    # Define operator config
     elif st_operator == "encrypt":
         operator_config = {"key": st_encrypt_key}
     elif st_operator == "highlight":
     else:
         operator_config = None
+    # Change operator if needed as intermediate step
     if st_operator == "highlight":
         operator = "custom"
+    elif st_operator == "synthesize":
+        operator = "replace"
     else:
         operator = st_operator
             tokens.append(text[: res.start])
         # append entity text and entity type
+        tokens.append((text[res.start : res.end], res.entity_type))
         # if another entity coming i.e. we're not at the last results element, add text up to next entity
         if i != len(results) - 1:
+            tokens.append(text[res.end : results[i + 1].start])
         # if no more entities coming, add all remaining text
         else:
+            tokens.append(text[res.end :])
     return tokens
+def create_fake_data(
+    text: str,
+    analyze_results: List[RecognizerResult],
+    openai_key: str,
+    openai_model_name: str,
+):
+    """
+    Build a synthetic variant of `text` with the help of an OpenAI model.
+    The text is passed through `anonymize` for the given findings, a prompt is
+    created from the anonymized text and sent to the requested model.
+    :param text: The original text
+    :param analyze_results: Findings passed on to `anonymize`
+    :param openai_key: OpenAI API key; when empty no request is made
+    :param openai_model_name: Name of the OpenAI model to call
+    :return: The model output, or a message asking for the key when it is missing
+    """
+    if openai_key:
+        anonymized = anonymize(text, analyze_results)
+        # The key has to be registered before the completion request is issued.
+        set_openai_key(openai_key)
+        return call_openai_api(create_prompt(anonymized.text), openai_model_name)
+    return "Please provide your OpenAI key"
+@st.cache_data
+def call_openai_api(prompt: str, openai_model_name: str) -> str:
+    """
+    Send `prompt` to the completion model and return its answer.
+    Thin wrapper around `call_completion_model`, decorated with `st.cache_data`.
+    :param prompt: The prompt for the completion model
+    :param openai_model_name: Name of the OpenAI model to call
+    :return: The value returned by `call_completion_model`
+    """
+    return call_completion_model(prompt, model=openai_model_name)
 st.set_page_config(page_title="Presidio demo", layout="wide")
 # Sidebar
 )
 st_model = st.sidebar.selectbox(
+    "NER model for PII detection",
     [
         "StanfordAIMI/stanford-deidentifier-base",
         "obi/deid_roberta_i2b2",
+        "flair/ner-english-large",
         "en_core_web_lg",
     ],
     index=1,
+    help="""
+    Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
+    Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair.
+    """,
 )
 st.sidebar.markdown("> Note: Models might take some time to download. ")
 st_operator = st.sidebar.selectbox(
     "De-identification approach",
+    ["redact", "replace", "synthesize", "highlight", "mask", "hash", "encrypt"],
     index=1,
+    help="""
+    Select which manipulation to the text is requested after PII has been identified.\n
+    - Redact: Completely remove the PII text\n
+    - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
+    - Synthesize: Replace with fake values (requires an OpenAI key)\n
+    - Highlight: Shows the original text with PII highlighted in colors\n
+    - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
+    - Hash: Replaces with the hash of the PII string\n
+    - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
+         """,
 )
 if st_operator == "mask":
     st_mask_char = st.sidebar.text_input("Mask character", value="*", max_chars=1)
 elif st_operator == "encrypt":
     st_encrypt_key = st.sidebar.text_input("AES key", value="WmZq4t7w!z%C&F)J")
+elif st_operator == "synthesize":
+    st_openai_key = st.sidebar.text_input(
+        "OPENAI_KEY",
+        value=os.getenv("OPENAI_KEY", default=""),
+        help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
+        type="password",
+    )
+    st_openai_model = st.sidebar.text_input(
+        "OpenAI model for text synthesis",
+        value="text-davinci-003",
+        help="See more here: https://platform.openai.com/docs/models/",
+    )
 st_threshold = st.sidebar.slider(
+    label="Acceptance threshold",
+    min_value=0.0,
+    max_value=1.0,
+    value=0.35,
+    help="Define the threshold for accepting a detection as PII. See more here: ",
 )
 st_return_decision_process = st.sidebar.checkbox(
+    "Add analysis explanations to findings", value=False,
+    help="Add the decision process to the output table. More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/"
 )
 st_entities = st.sidebar.multiselect(
     label="Which entities to look for?",
     options=get_supported_entities(),
     default=list(get_supported_entities()),
+    help="Limit the list of PII entities detected. This list is dynamic and based on the NER model and registered recognizers. More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/"
 )
 # Main panel
 )
 # After
+if st_operator not in ("highlight", "synthesize"):
     with col2:
         st.subheader(f"Output")
         st_anonymize_results = anonymize(st_text, st_analyze_results)
         st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
+elif st_operator == "synthesize":
+    with col2:
+        st.subheader(f"OpenAI Generated output")
+        fake_data = create_fake_data(
+            st_text,
+            st_analyze_results,
+            openai_key=st_openai_key,
+            openai_model_name=st_openai_model,
+        )
+        st.text_area(label="Synthetic data", value=fake_data, height=400)
 else:
     st.subheader("Highlighted")
     annotated_tokens = annotate(st_text, st_analyze_results)
 )
 if st_analyze_results:
     df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+    df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
     df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
         {
         },
         axis=1,
     )
+    df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
     if st_return_decision_process:
         analysis_explanation_df = pd.DataFrame.from_records(
             [r.analysis_explanation.to_dict() for r in st_analyze_results]

requirements.txt CHANGED Viewed

@@ -4,4 +4,6 @@ streamlit
 pandas
 st-annotated-text
 torch
-transformers

 pandas
 st-annotated-text
 torch
+transformers
+flair
+openai