Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Upload 12 files (#2)
Browse files- Upload 12 files (17f243fd769fde0cdd472eb6975d3f2d8e55c8f7)
- azure_ai_language_wrapper.py +126 -0
- flair_recognizer.py +5 -5
- flair_test.py +25 -0
- index.md +15 -5
- openai_fake_data_generator.py +28 -33
- presidio_helpers.py +11 -14
- presidio_nlp_engine_config.py +118 -40
- presidio_streamlit.py +49 -22
- requirements.txt +2 -3
- test_streamlit.py +43 -0
    	
        azure_ai_language_wrapper.py
    ADDED
    
    | @@ -0,0 +1,126 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            from typing import List, Optional
         | 
| 3 | 
            +
            import logging
         | 
| 4 | 
            +
            import dotenv
         | 
| 5 | 
            +
            from azure.ai.textanalytics import TextAnalyticsClient
         | 
| 6 | 
            +
            from azure.core.credentials import AzureKeyCredential
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
         | 
| 9 | 
            +
            from presidio_analyzer.nlp_engine import NlpArtifacts
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            logger = logging.getLogger("presidio-streamlit")
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            class AzureAIServiceWrapper(EntityRecognizer):
                """Presidio recognizer backed by the Azure Text Analytics PII endpoint.

                Each call to :meth:`analyze` sends the text to ``recognize_pii_entities``
                and converts the returned entities into ``RecognizerResult`` objects.
                """

                from azure.ai.textanalytics._models import PiiEntityCategory

                # All category values exposed by the SDK's PiiEntityCategory enum;
                # used as the default set of supported entities.
                TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

                def __init__(
                    self,
                    supported_entities: Optional[List[str]] = None,
                    supported_language: str = "en",
                    ta_client: Optional[TextAnalyticsClient] = None,
                    ta_key: Optional[str] = None,
                    ta_endpoint: Optional[str] = None,
                ):
                    """
                    Wrapper for the Azure Text Analytics client
                    :param supported_entities: entity categories to report;
                        defaults to ``TA_SUPPORTED_ENTITIES`` when empty or None
                    :param supported_language: language code sent with every request
                    :param ta_client: object of type TextAnalyticsClient; when omitted,
                        a client is built from ``ta_key`` and ``ta_endpoint``
                    :param ta_key: Azure cognitive Services for Language key
                    :param ta_endpoint: Azure cognitive Services for Language endpoint
                    """

                    if not supported_entities:
                        supported_entities = self.TA_SUPPORTED_ENTITIES

                    super().__init__(
                        supported_entities=supported_entities,
                        supported_language=supported_language,
                        name="Azure AI Language PII",
                    )

                    self.ta_key = ta_key
                    self.ta_endpoint = ta_endpoint

                    if not ta_client:
                        ta_client = self.__authenticate_client(ta_key, ta_endpoint)
                    self.ta_client = ta_client

                @staticmethod
                def __authenticate_client(key: str, endpoint: str):
                    """Build a TextAnalyticsClient from an API key and an endpoint URL.

                    :param key: Azure cognitive Services for Language key
                    :param endpoint: Azure cognitive Services for Language endpoint
                    :return: a client authenticated with an ``AzureKeyCredential``
                    """
                    ta_credential = AzureKeyCredential(key)
                    text_analytics_client = TextAnalyticsClient(
                        endpoint=endpoint, credential=ta_credential
                    )
                    return text_analytics_client

                def analyze(
                    self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
                ) -> List[RecognizerResult]:
                    """Detect PII in ``text`` using the Text Analytics service.

                    :param text: the text to analyze
                    :param entities: accepted for interface compatibility; results are
                        filtered by ``self.supported_entities``, not by this argument
                    :param nlp_artifacts: not used by this recognizer
                    :return: one RecognizerResult per returned entity whose category
                        is in ``self.supported_entities``
                    """
                    response = self.ta_client.recognize_pii_entities(
                        [text], language=self.supported_language
                    )
                    # Documents flagged as errors carry no entities; skip them.
                    results = [doc for doc in response if not doc.is_error]
                    recognizer_results = []
                    for res in results:
                        for entity in res.entities:
                            if entity.category not in self.supported_entities:
                                continue
                            analysis_explanation = AzureAIServiceWrapper._build_explanation(
                                original_score=entity.confidence_score,
                                entity_type=entity.category,
                            )
                            recognizer_results.append(
                                RecognizerResult(
                                    entity_type=entity.category,
                                    start=entity.offset,
                                    # End is derived from the matched text's length.
                                    end=entity.offset + len(entity.text),
                                    score=entity.confidence_score,
                                    analysis_explanation=analysis_explanation,
                                )
                            )

                    return recognizer_results

                @staticmethod
                def _build_explanation(
                    original_score: float, entity_type: str
                ) -> AnalysisExplanation:
                    """Create the explanation attached to each result.

                    :param original_score: confidence score reported by the service
                    :param entity_type: category of the detected entity
                    :return: an AnalysisExplanation naming this recognizer
                    """
                    explanation = AnalysisExplanation(
                        # ``__class__`` on a class object is its metaclass, so
                        # ``__class__.__name__`` would not be this recognizer's name.
                        recognizer=AzureAIServiceWrapper.__name__,
                        original_score=original_score,
                        textual_explanation=f"Identified as {entity_type} by Text Analytics",
                    )
                    return explanation

                def load(self) -> None:
                    """No-op: the client is created in ``__init__``."""
                    pass
         | 
| 101 | 
            +
             | 
| 102 | 
            +
             | 
| 103 | 
            +
            if __name__ == "__main__":
                # Manual smoke test: needs TA_KEY and TA_ENDPOINT in the environment
                # (or in a local .env file) and calls the live Azure service.
                import presidio_helpers

                dotenv.load_dotenv()
                text = """
                Here are a few example sentences we currently support:

                Hello, my name is David Johnson and I live in Maine.
                My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
                
                On September 18 I visited microsoft.com and sent an email to test@presidio.site,  from the IP 192.168.0.1.
                
                My passport: 191280342 and my phone number: (212) 555-1234.
                
                This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
                
                Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.
                """
                analyzer = presidio_helpers.analyzer_engine(
                    model_path="Azure Text Analytics PII",
                    ta_key=os.environ["TA_KEY"],
                    ta_endpoint=os.environ["TA_ENDPOINT"],
                )
                # Print the findings so the smoke test has visible output.
                results = analyzer.analyze(text=text, language="en")
                print(results)
         | 
    	
        flair_recognizer.py
    CHANGED
    
    | @@ -59,9 +59,7 @@ class FlairRecognizer(EntityRecognizer): | |
| 59 | 
             
                    # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
         | 
| 60 | 
             
                ]
         | 
| 61 |  | 
| 62 | 
            -
                MODEL_LANGUAGES = {
         | 
| 63 | 
            -
                    "en": "flair/ner-english-large"
         | 
| 64 | 
            -
                }
         | 
| 65 |  | 
| 66 | 
             
                PRESIDIO_EQUIVALENCES = {
         | 
| 67 | 
             
                    "PER": "PERSON",
         | 
| @@ -76,7 +74,7 @@ class FlairRecognizer(EntityRecognizer): | |
| 76 | 
             
                    supported_entities: Optional[List[str]] = None,
         | 
| 77 | 
             
                    check_label_groups: Optional[Tuple[Set, Set]] = None,
         | 
| 78 | 
             
                    model: SequenceTagger = None,
         | 
| 79 | 
            -
                    model_path: Optional[str] = None
         | 
| 80 | 
             
                ):
         | 
| 81 | 
             
                    self.check_label_groups = (
         | 
| 82 | 
             
                        check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
         | 
| @@ -93,7 +91,9 @@ class FlairRecognizer(EntityRecognizer): | |
| 93 | 
             
                        self.model = SequenceTagger.load(model_path)
         | 
| 94 | 
             
                    else:
         | 
| 95 | 
             
                        print(f"Loading model for language {supported_language}")
         | 
| 96 | 
            -
                        self.model = SequenceTagger.load( | 
|  | |
|  | |
| 97 |  | 
| 98 | 
             
                    super().__init__(
         | 
| 99 | 
             
                        supported_entities=supported_entities,
         | 
|  | |
| 59 | 
             
                    # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
         | 
| 60 | 
             
                ]
         | 
| 61 |  | 
| 62 | 
            +
                MODEL_LANGUAGES = {"en": "flair/ner-english-large"}
         | 
|  | |
|  | |
| 63 |  | 
| 64 | 
             
                PRESIDIO_EQUIVALENCES = {
         | 
| 65 | 
             
                    "PER": "PERSON",
         | 
|  | |
| 74 | 
             
                    supported_entities: Optional[List[str]] = None,
         | 
| 75 | 
             
                    check_label_groups: Optional[Tuple[Set, Set]] = None,
         | 
| 76 | 
             
                    model: SequenceTagger = None,
         | 
| 77 | 
            +
                    model_path: Optional[str] = None,
         | 
| 78 | 
             
                ):
         | 
| 79 | 
             
                    self.check_label_groups = (
         | 
| 80 | 
             
                        check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
         | 
|  | |
| 91 | 
             
                        self.model = SequenceTagger.load(model_path)
         | 
| 92 | 
             
                    else:
         | 
| 93 | 
             
                        print(f"Loading model for language {supported_language}")
         | 
| 94 | 
            +
                        self.model = SequenceTagger.load(
         | 
| 95 | 
            +
                            self.MODEL_LANGUAGES.get(supported_language)
         | 
| 96 | 
            +
                        )
         | 
| 97 |  | 
| 98 | 
             
                    super().__init__(
         | 
| 99 | 
             
                        supported_entities=supported_entities,
         | 
    	
        flair_test.py
    ADDED
    
    | @@ -0,0 +1,25 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # Import generic wrappers
         | 
| 2 | 
            +
            from transformers import AutoModel, AutoTokenizer
         | 
| 3 | 
            +
             | 
| 4 | 
            +
             | 
| 5 | 
            +
            if __name__ == "__main__":
                # Manual check that the Flair NER model loads and tags a sentence.
                from flair.data import Sentence
                from flair.models import SequenceTagger

                # Load the English NER model
                ner_model = SequenceTagger.load("flair/ner-english-large")

                # Build a sample sentence and tag it in place
                sample = Sentence("George Washington went to Washington")
                ner_model.predict(sample)

                # Show the tagged sentence
                print(sample)

                # List every predicted NER span
                print("The following NER tags are found:")
                ner_spans = sample.get_spans("ner")
                for span in ner_spans:
                    print(span)
         | 
    	
        index.md
    CHANGED
    
    | @@ -5,22 +5,32 @@ The app is based on the [streamlit](https://streamlit.io/) package. | |
| 5 | 
             
            A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
         | 
| 6 |  | 
| 7 | 
             
            ## Requirements
         | 
| 8 | 
            -
            1. Clone the repo and move to the `docs/samples/python/streamlit | 
| 9 | 
            -
             | 
| 10 |  | 
| 11 | 
             
            ```sh
         | 
| 12 | 
             
            pip install -r requirements.txt
         | 
| 13 | 
             
            ```
         | 
| 14 | 
             
            > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
         | 
| 15 |  | 
| 16 | 
            -
            2. 
         | 
| 17 | 
             
            3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
         | 
| 18 | 
            -
             | 
| 19 |  | 
| 20 | 
             
            ```sh
         | 
| 21 | 
             
            streamlit run presidio_streamlit.py
         | 
| 22 | 
             
            ```
         | 
| 23 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 24 | 
             
            ## Output
         | 
| 25 | 
             
            Output should be similar to this screenshot:
         | 
| 26 | 
            -
            
         | 
| 10 |  | 
| 11 | 
             
            ```sh
         | 
| 12 | 
             
            pip install -r requirements.txt
         | 
| 13 | 
             
            ```
         | 
| 14 | 
             
            > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
         | 
| 15 |  | 
|  | |
| 16 | 
             
            3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
         | 
| 17 | 
            +
            4. Start the app:
         | 
| 18 |  | 
| 19 | 
             
            ```sh
         | 
| 20 | 
             
            streamlit run presidio_streamlit.py
         | 
| 21 | 
             
            ```
         | 
| 22 |  | 
| 23 | 
            +
            5. Consider adding an `.env` file with the following environment variables, for further customizability:
         | 
| 24 | 
            +
            ```sh
         | 
| 25 | 
            +
            TA_KEY=YOUR_TEXT_ANALYTICS_KEY
         | 
| 26 | 
            +
            TA_ENDPOINT=YOUR_TEXT_ANALYTICS_ENDPOINT
         | 
| 27 | 
            +
            OPENAI_TYPE="Azure" #or "openai"
         | 
| 28 | 
            +
            OPENAI_KEY=YOUR_OPENAI_KEY
         | 
| 29 | 
            +
            OPENAI_API_VERSION="2023-05-15"
         | 
| 30 | 
            +
            AZURE_OPENAI_ENDPOINT=YOUR_AZURE_OPENAI_ENDPOINT
         | 
| 31 | 
            +
            AZURE_OPENAI_DEPLOYMENT=text-davinci-003
         | 
| 32 | 
            +
            ALLOW_OTHER_MODELS=true #true if the user could download new models
         | 
| 33 | 
            +
            ```
         | 
| 34 | 
             
            ## Output
         | 
| 35 | 
             
            Output should be similar to this screenshot:
         | 
| 36 | 
            +
            
         | 
    	
        openai_fake_data_generator.py
    CHANGED
    
    | @@ -2,51 +2,45 @@ from collections import namedtuple | |
| 2 | 
             
            from typing import Optional
         | 
| 3 |  | 
| 4 | 
             
            import openai
         | 
|  | |
| 5 | 
             
            import logging
         | 
| 6 |  | 
| 7 | 
             
            logger = logging.getLogger("presidio-streamlit")
         | 
| 8 |  | 
| 9 | 
             
            OpenAIParams = namedtuple(
         | 
| 10 | 
             
                "open_ai_params",
         | 
| 11 | 
            -
                ["openai_key", "model", "api_base", " | 
| 12 | 
             
            )
         | 
| 13 |  | 
| 14 |  | 
| 15 | 
            -
            def set_openai_params(openai_params: OpenAIParams):
         | 
| 16 | 
            -
                """Set the OpenAI API key.
         | 
| 17 | 
            -
                :param openai_params: OpenAIParams object with the following fields: key, model, api version, deployment_name,
         | 
| 18 | 
            -
                The latter only relate to Azure OpenAI deployments.
         | 
| 19 | 
            -
                """
         | 
| 20 | 
            -
                openai.api_key = openai_params.openai_key
         | 
| 21 | 
            -
                openai.api_version = openai_params.api_version
         | 
| 22 | 
            -
                if openai_params.api_base:
         | 
| 23 | 
            -
                    openai.api_base = openai_params.api_base
         | 
| 24 | 
            -
                    openai.api_type = openai_params.api_type
         | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
             
            def call_completion_model(
         | 
| 28 | 
             
                prompt: str,
         | 
| 29 | 
            -
                 | 
| 30 | 
            -
                max_tokens: int =  | 
| 31 | 
            -
                deployment_id: Optional[str] = None,
         | 
| 32 | 
             
            ) -> str:
         | 
| 33 | 
             
                """Creates a request for the OpenAI Completion service and returns the response.
         | 
| 34 |  | 
| 35 | 
             
                :param prompt: The prompt for the completion model
         | 
| 36 | 
            -
                :param  | 
| 37 | 
            -
                :param max_tokens:  | 
| 38 | 
            -
                :param deployment_id: Azure OpenAI deployment ID
         | 
| 39 | 
             
                """
         | 
| 40 | 
            -
                if  | 
| 41 | 
            -
                     | 
| 42 | 
            -
                         | 
|  | |
|  | |
|  | |
| 43 | 
             
                    )
         | 
| 44 | 
             
                else:
         | 
| 45 | 
            -
                     | 
| 46 | 
            -
                        model=model, prompt=prompt, max_tokens=max_tokens
         | 
| 47 | 
            -
                    )
         | 
| 48 |  | 
| 49 | 
            -
                 | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 50 |  | 
| 51 |  | 
| 52 | 
             
            def create_prompt(anonymized_text: str) -> str:
         | 
| @@ -64,17 +58,18 @@ def create_prompt(anonymized_text: str) -> str: | |
| 64 |  | 
| 65 | 
             
                a. Use completely random numbers, so every digit is drawn between 0 and 9.
         | 
| 66 | 
             
                b. Use realistic names that come from diverse genders, ethnicities and countries.
         | 
| 67 | 
            -
                c. If there are no placeholders, return the text as is | 
| 68 | 
             
                d. Keep the formatting as close to the original as possible.
         | 
| 69 | 
             
                e. If PII exists in the input, replace it with fake values in the output.
         | 
|  | |
| 70 |  | 
| 71 | 
            -
                input: How do I change the limit on my credit card {{credit_card_number}}?
         | 
| 72 | 
             
                output: How do I change the limit on my credit card 2539 3519 2345 1555?
         | 
| 73 | 
            -
                input: <PERSON> was the chief science officer at <ORGANIZATION>.
         | 
| 74 | 
             
                output: Katherine Buckjov was the chief science officer at NASA.
         | 
| 75 | 
            -
                input: Cameroon lives in <LOCATION>.
         | 
| 76 | 
             
                output: Vladimir lives in Moscow.
         | 
| 77 | 
            -
                 | 
| 78 | 
            -
                 | 
| 79 | 
            -
                """
         | 
| 80 | 
             
                return prompt
         | 
|  | |
| 2 | 
             
            from typing import Optional
         | 
| 3 |  | 
| 4 | 
             
            import openai
         | 
| 5 | 
            +
            from openai import OpenAI, AzureOpenAI
         | 
| 6 | 
             
            import logging
         | 
| 7 |  | 
| 8 | 
             
            logger = logging.getLogger("presidio-streamlit")
         | 
| 9 |  | 
| 10 | 
             
            # Settings bundle passed to the completion call: API key, model name,
            # and the endpoint/deployment/version/type fields.
            OpenAIParams = namedtuple(
                "open_ai_params",
                "openai_key model api_base deployment_id api_version api_type",
            )
         | 
| 14 |  | 
| 15 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 16 | 
             
            def call_completion_model(
                prompt: str,
                openai_params: OpenAIParams,
                max_tokens: Optional[int] = 256,
            ) -> str:
                """Creates a request for the OpenAI Completion service and returns the response.

                :param prompt: The prompt for the completion model
                :param openai_params: OpenAI parameters for the completion model
                :param max_tokens: The maximum number of tokens to generate.
                :return: Text of the first completion choice, stripped of
                    surrounding whitespace.
                """
                # api_type may be None when no API type was configured; treat
                # anything other than "azure" (case-insensitive) as plain OpenAI
                # instead of failing on None.lower().
                api_type = (openai_params.api_type or "").lower()
                if api_type == "azure":
                    client = AzureOpenAI(
                        api_version=openai_params.api_version,
                        api_key=openai_params.openai_key,
                        azure_endpoint=openai_params.api_base,
                        azure_deployment=openai_params.deployment_id,
                    )
                else:
                    client = OpenAI(api_key=openai_params.openai_key)

                response = client.completions.create(
                    model=openai_params.model,
                    prompt=prompt,
                    max_tokens=max_tokens,
                )

                return response.choices[0].text.strip()
         | 
| 44 |  | 
| 45 |  | 
| 46 | 
             
            def create_prompt(anonymized_text: str) -> str:
         | 
|  | |
| 58 |  | 
| 59 | 
             
                a. Use completely random numbers, so every digit is drawn between 0 and 9.
         | 
| 60 | 
             
                b. Use realistic names that come from diverse genders, ethnicities and countries.
         | 
| 61 | 
            +
                c. If there are no placeholders, return the text as is.
         | 
| 62 | 
             
                d. Keep the formatting as close to the original as possible.
         | 
| 63 | 
             
                e. If PII exists in the input, replace it with fake values in the output.
         | 
| 64 | 
            +
                f. Remove whitespace before and after the generated text
         | 
| 65 |  | 
| 66 | 
            +
                input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]]
         | 
| 67 | 
             
                output: How do I change the limit on my credit card 2539 3519 2345 1555?
         | 
| 68 | 
            +
                input: [[TEXT STARTS]]<PERSON> was the chief science officer at <ORGANIZATION>.[[TEXT ENDS]]
         | 
| 69 | 
             
                output: Katherine Buckjov was the chief science officer at NASA.
         | 
| 70 | 
            +
                input: [[TEXT STARTS]]Cameroon lives in <LOCATION>.[[TEXT ENDS]]
         | 
| 71 | 
             
                output: Vladimir lives in Moscow.
         | 
| 72 | 
            +
                
         | 
| 73 | 
            +
                input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]]
         | 
| 74 | 
            +
                output:"""
         | 
| 75 | 
             
                return prompt
         | 
    	
        presidio_helpers.py
    CHANGED
    
    | @@ -16,16 +16,16 @@ from presidio_anonymizer import AnonymizerEngine | |
| 16 | 
             
            from presidio_anonymizer.entities import OperatorConfig
         | 
| 17 |  | 
| 18 | 
             
            from openai_fake_data_generator import (
         | 
| 19 | 
            -
                set_openai_params,
         | 
| 20 | 
             
                call_completion_model,
         | 
| 21 | 
            -
                create_prompt,
         | 
| 22 | 
             
                OpenAIParams,
         | 
|  | |
| 23 | 
             
            )
         | 
| 24 | 
             
            from presidio_nlp_engine_config import (
         | 
| 25 | 
             
                create_nlp_engine_with_spacy,
         | 
| 26 | 
             
                create_nlp_engine_with_flair,
         | 
| 27 | 
             
                create_nlp_engine_with_transformers,
         | 
| 28 | 
            -
                 | 
|  | |
| 29 | 
             
            )
         | 
| 30 |  | 
| 31 | 
             
            logger = logging.getLogger("presidio-streamlit")
         | 
| @@ -49,14 +49,16 @@ def nlp_engine_and_registry( | |
| 49 | 
             
                """
         | 
| 50 |  | 
| 51 | 
             
                # Set up NLP Engine according to the model of choice
         | 
| 52 | 
            -
                if " | 
| 53 | 
             
                    return create_nlp_engine_with_spacy(model_path)
         | 
| 54 | 
            -
                 | 
|  | |
|  | |
| 55 | 
             
                    return create_nlp_engine_with_flair(model_path)
         | 
| 56 | 
            -
                elif " | 
| 57 | 
             
                    return create_nlp_engine_with_transformers(model_path)
         | 
| 58 | 
            -
                elif " | 
| 59 | 
            -
                    return  | 
| 60 | 
             
                else:
         | 
| 61 | 
             
                    raise ValueError(f"Model family {model_family} not supported")
         | 
| 62 |  | 
| @@ -215,14 +217,9 @@ def create_fake_data( | |
| 215 | 
             
                if not openai_params.openai_key:
         | 
| 216 | 
             
                    return "Please provide your OpenAI key"
         | 
| 217 | 
             
                results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
         | 
| 218 | 
            -
                set_openai_params(openai_params)
         | 
| 219 | 
             
                prompt = create_prompt(results.text)
         | 
| 220 | 
             
                print(f"Prompt: {prompt}")
         | 
| 221 | 
            -
                fake =  | 
| 222 | 
            -
                    prompt=prompt,
         | 
| 223 | 
            -
                    openai_model_name=openai_params.model,
         | 
| 224 | 
            -
                    openai_deployment_name=openai_params.deployment_name,
         | 
| 225 | 
            -
                )
         | 
| 226 | 
             
                return fake
         | 
| 227 |  | 
| 228 |  | 
|  | |
| 16 | 
             
            from presidio_anonymizer.entities import OperatorConfig
         | 
| 17 |  | 
| 18 | 
             
            from openai_fake_data_generator import (
         | 
|  | |
| 19 | 
             
                call_completion_model,
         | 
|  | |
| 20 | 
             
                OpenAIParams,
         | 
| 21 | 
            +
                create_prompt,
         | 
| 22 | 
             
            )
         | 
| 23 | 
             
            from presidio_nlp_engine_config import (
         | 
| 24 | 
             
                create_nlp_engine_with_spacy,
         | 
| 25 | 
             
                create_nlp_engine_with_flair,
         | 
| 26 | 
             
                create_nlp_engine_with_transformers,
         | 
| 27 | 
            +
                create_nlp_engine_with_azure_ai_language,
         | 
| 28 | 
            +
                create_nlp_engine_with_stanza,
         | 
| 29 | 
             
            )
         | 
| 30 |  | 
| 31 | 
             
            logger = logging.getLogger("presidio-streamlit")
         | 
|  | |
| 49 | 
             
                """
         | 
| 50 |  | 
| 51 | 
             
                # Set up NLP Engine according to the model of choice
         | 
| 52 | 
            +
                if "spacy" in model_family.lower():
         | 
| 53 | 
             
                    return create_nlp_engine_with_spacy(model_path)
         | 
| 54 | 
            +
                if "stanza" in model_family.lower():
         | 
| 55 | 
            +
                    return create_nlp_engine_with_stanza(model_path)
         | 
| 56 | 
            +
                elif "flair" in model_family.lower():
         | 
| 57 | 
             
                    return create_nlp_engine_with_flair(model_path)
         | 
| 58 | 
            +
                elif "huggingface" in model_family.lower():
         | 
| 59 | 
             
                    return create_nlp_engine_with_transformers(model_path)
         | 
| 60 | 
            +
                elif "azure ai language" in model_family.lower():
         | 
| 61 | 
            +
                    return create_nlp_engine_with_azure_ai_language(ta_key, ta_endpoint)
         | 
| 62 | 
             
                else:
         | 
| 63 | 
             
                    raise ValueError(f"Model family {model_family} not supported")
         | 
| 64 |  | 
|  | |
| 217 | 
             
                if not openai_params.openai_key:
         | 
| 218 | 
             
                    return "Please provide your OpenAI key"
         | 
| 219 | 
             
                results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
         | 
|  | |
| 220 | 
             
                prompt = create_prompt(results.text)
         | 
| 221 | 
             
                print(f"Prompt: {prompt}")
         | 
| 222 | 
            +
                fake = call_completion_model(prompt=prompt, openai_params=openai_params)
         | 
|  | |
|  | |
|  | |
|  | |
| 223 | 
             
                return fake
         | 
| 224 |  | 
| 225 |  | 
    	
        presidio_nlp_engine_config.py
    CHANGED
    
    | @@ -1,8 +1,12 @@ | |
| 1 | 
            -
            from typing import Tuple
         | 
| 2 | 
             
            import logging
         | 
|  | |
|  | |
| 3 | 
             
            import spacy
         | 
| 4 | 
             
            from presidio_analyzer import RecognizerRegistry
         | 
| 5 | 
            -
            from presidio_analyzer.nlp_engine import  | 
|  | |
|  | |
|  | |
| 6 |  | 
| 7 | 
             
            logger = logging.getLogger("presidio-streamlit")
         | 
| 8 |  | 
| @@ -12,21 +16,70 @@ def create_nlp_engine_with_spacy( | |
| 12 | 
             
            ) -> Tuple[NlpEngine, RecognizerRegistry]:
         | 
| 13 | 
             
                """
         | 
| 14 | 
             
                Instantiate an NlpEngine with a spaCy model
         | 
| 15 | 
            -
                :param model_path:  | 
| 16 | 
             
                """
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 17 | 
             
                registry = RecognizerRegistry()
         | 
| 18 | 
            -
                registry.load_predefined_recognizers()
         | 
| 19 |  | 
| 20 | 
            -
                 | 
| 21 | 
            -
                    spacy.cli.download(model_path)
         | 
| 22 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 23 | 
             
                nlp_configuration = {
         | 
| 24 | 
            -
                    "nlp_engine_name": " | 
| 25 | 
             
                    "models": [{"lang_code": "en", "model_name": model_path}],
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 26 | 
             
                }
         | 
| 27 |  | 
| 28 | 
             
                nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
         | 
| 29 |  | 
|  | |
|  | |
|  | |
| 30 | 
             
                return nlp_engine, registry
         | 
| 31 |  | 
| 32 |  | 
| @@ -39,41 +92,62 @@ def create_nlp_engine_with_transformers( | |
| 39 | 
             
                would return NlpArtifacts such as POS and lemmas.
         | 
| 40 | 
             
                :param model_path: HuggingFace model path.
         | 
| 41 | 
             
                """
         | 
|  | |
| 42 |  | 
| 43 | 
            -
                from transformers_rec import (
         | 
| 44 | 
            -
                    STANFORD_COFIGURATION,
         | 
| 45 | 
            -
                    BERT_DEID_CONFIGURATION,
         | 
| 46 | 
            -
                    TransformersRecognizer,
         | 
| 47 | 
            -
                )
         | 
| 48 | 
            -
             | 
| 49 | 
            -
                registry = RecognizerRegistry()
         | 
| 50 | 
            -
                registry.load_predefined_recognizers()
         | 
| 51 | 
            -
             | 
| 52 | 
            -
                if not spacy.util.is_package("en_core_web_sm"):
         | 
| 53 | 
            -
                    spacy.cli.download("en_core_web_sm")
         | 
| 54 | 
            -
                # Using a small spaCy model + a HF NER model
         | 
| 55 | 
            -
                transformers_recognizer = TransformersRecognizer(model_path=model_path)
         | 
| 56 | 
            -
             | 
| 57 | 
            -
                if model_path == "StanfordAIMI/stanford-deidentifier-base":
         | 
| 58 | 
            -
                    transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
         | 
| 59 | 
            -
                elif model_path == "obi/deid_roberta_i2b2":
         | 
| 60 | 
            -
                    transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
         | 
| 61 | 
            -
                else:
         | 
| 62 | 
            -
                    print(f"Warning: Model has no configuration, loading default.")
         | 
| 63 | 
            -
                    transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
         | 
| 64 | 
            -
             | 
| 65 | 
            -
                # Use small spaCy model, no need for both spacy and HF models
         | 
| 66 | 
            -
                # The transformers model is used here as a recognizer, not as an NlpEngine
         | 
| 67 | 
             
                nlp_configuration = {
         | 
| 68 | 
            -
                    "nlp_engine_name": " | 
| 69 | 
            -
                    "models": [ | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 70 | 
             
                }
         | 
| 71 |  | 
| 72 | 
            -
                registry.add_recognizer(transformers_recognizer)
         | 
| 73 | 
            -
                registry.remove_recognizer("SpacyRecognizer")
         | 
| 74 | 
            -
             | 
| 75 | 
             
                nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
         | 
| 76 |  | 
|  | |
|  | |
|  | |
| 77 | 
             
                return nlp_engine, registry
         | 
| 78 |  | 
| 79 |  | 
| @@ -91,6 +165,8 @@ def create_nlp_engine_with_flair( | |
| 91 | 
             
                registry = RecognizerRegistry()
         | 
| 92 | 
             
                registry.load_predefined_recognizers()
         | 
| 93 |  | 
|  | |
|  | |
| 94 | 
             
                if not spacy.util.is_package("en_core_web_sm"):
         | 
| 95 | 
             
                    spacy.cli.download("en_core_web_sm")
         | 
| 96 | 
             
                # Using a small spaCy model + a Flair NER model
         | 
| @@ -107,7 +183,7 @@ def create_nlp_engine_with_flair( | |
| 107 | 
             
                return nlp_engine, registry
         | 
| 108 |  | 
| 109 |  | 
| 110 | 
            -
            def  | 
| 111 | 
             
                """
         | 
| 112 | 
             
                Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
         | 
| 113 | 
             
                The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
         | 
| @@ -115,7 +191,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str): | |
| 115 | 
             
                :param ta_key: Azure Text Analytics key.
         | 
| 116 | 
             
                :param ta_endpoint: Azure Text Analytics endpoint.
         | 
| 117 | 
             
                """
         | 
| 118 | 
            -
                from  | 
| 119 |  | 
| 120 | 
             
                if not ta_key or not ta_endpoint:
         | 
| 121 | 
             
                    raise RuntimeError("Please fill in the Text Analytics endpoint details")
         | 
| @@ -123,7 +199,9 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str): | |
| 123 | 
             
                registry = RecognizerRegistry()
         | 
| 124 | 
             
                registry.load_predefined_recognizers()
         | 
| 125 |  | 
| 126 | 
            -
                 | 
|  | |
|  | |
| 127 | 
             
                nlp_configuration = {
         | 
| 128 | 
             
                    "nlp_engine_name": "spacy",
         | 
| 129 | 
             
                    "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
         | 
| @@ -131,7 +209,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str): | |
| 131 |  | 
| 132 | 
             
                nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
         | 
| 133 |  | 
| 134 | 
            -
                registry.add_recognizer( | 
| 135 | 
             
                registry.remove_recognizer("SpacyRecognizer")
         | 
| 136 |  | 
| 137 | 
             
                return nlp_engine, registry
         | 
|  | |
|  | |
| 1 | 
             
            import logging
         | 
| 2 | 
            +
            from typing import Tuple
         | 
| 3 | 
            +
             | 
| 4 | 
             
            import spacy
         | 
| 5 | 
             
            from presidio_analyzer import RecognizerRegistry
         | 
| 6 | 
            +
            from presidio_analyzer.nlp_engine import (
         | 
| 7 | 
            +
                NlpEngine,
         | 
| 8 | 
            +
                NlpEngineProvider,
         | 
| 9 | 
            +
            )
         | 
| 10 |  | 
| 11 | 
             
            logger = logging.getLogger("presidio-streamlit")
         | 
| 12 |  | 
|  | |
| 16 | 
             
            ) -> Tuple[NlpEngine, RecognizerRegistry]:
         | 
| 17 | 
             
                """
         | 
| 18 | 
             
                Instantiate an NlpEngine with a spaCy model
         | 
| 19 | 
            +
                :param model_path: path to model / model name.
         | 
| 20 | 
             
                """
         | 
| 21 | 
            +
                nlp_configuration = {
         | 
| 22 | 
            +
                    "nlp_engine_name": "spacy",
         | 
| 23 | 
            +
                    "models": [{"lang_code": "en", "model_name": model_path}],
         | 
| 24 | 
            +
                    "ner_model_configuration": {
         | 
| 25 | 
            +
                        "model_to_presidio_entity_mapping": {
         | 
| 26 | 
            +
                            "PER": "PERSON",
         | 
| 27 | 
            +
                            "PERSON": "PERSON",
         | 
| 28 | 
            +
                            "NORP": "NRP",
         | 
| 29 | 
            +
                            "FAC": "FACILITY",
         | 
| 30 | 
            +
                            "LOC": "LOCATION",
         | 
| 31 | 
            +
                            "GPE": "LOCATION",
         | 
| 32 | 
            +
                            "LOCATION": "LOCATION",
         | 
| 33 | 
            +
                            "ORG": "ORGANIZATION",
         | 
| 34 | 
            +
                            "ORGANIZATION": "ORGANIZATION",
         | 
| 35 | 
            +
                            "DATE": "DATE_TIME",
         | 
| 36 | 
            +
                            "TIME": "DATE_TIME",
         | 
| 37 | 
            +
                        },
         | 
| 38 | 
            +
                        "low_confidence_score_multiplier": 0.4,
         | 
| 39 | 
            +
                        "low_score_entity_names": ["ORG", "ORGANIZATION"],
         | 
| 40 | 
            +
                    },
         | 
| 41 | 
            +
                }
         | 
| 42 | 
            +
             | 
| 43 | 
            +
                nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
         | 
| 44 | 
            +
             | 
| 45 | 
             
                registry = RecognizerRegistry()
         | 
| 46 | 
            +
                registry.load_predefined_recognizers(nlp_engine=nlp_engine)
         | 
| 47 |  | 
| 48 | 
            +
                return nlp_engine, registry
         | 
|  | |
| 49 |  | 
| 50 | 
            +
             | 
| 51 | 
            +
            def create_nlp_engine_with_stanza(
         | 
| 52 | 
            +
                model_path: str,
         | 
| 53 | 
            +
            ) -> Tuple[NlpEngine, RecognizerRegistry]:
         | 
| 54 | 
            +
                """
         | 
| 55 | 
            +
                Instantiate an NlpEngine with a stanza model
         | 
| 56 | 
            +
                :param model_path: path to model / model name.
         | 
| 57 | 
            +
                """
         | 
| 58 | 
             
                nlp_configuration = {
         | 
| 59 | 
            +
                    "nlp_engine_name": "stanza",
         | 
| 60 | 
             
                    "models": [{"lang_code": "en", "model_name": model_path}],
         | 
| 61 | 
            +
                    "ner_model_configuration": {
         | 
| 62 | 
            +
                        "model_to_presidio_entity_mapping": {
         | 
| 63 | 
            +
                            "PER": "PERSON",
         | 
| 64 | 
            +
                            "PERSON": "PERSON",
         | 
| 65 | 
            +
                            "NORP": "NRP",
         | 
| 66 | 
            +
                            "FAC": "FACILITY",
         | 
| 67 | 
            +
                            "LOC": "LOCATION",
         | 
| 68 | 
            +
                            "GPE": "LOCATION",
         | 
| 69 | 
            +
                            "LOCATION": "LOCATION",
         | 
| 70 | 
            +
                            "ORG": "ORGANIZATION",
         | 
| 71 | 
            +
                            "ORGANIZATION": "ORGANIZATION",
         | 
| 72 | 
            +
                            "DATE": "DATE_TIME",
         | 
| 73 | 
            +
                            "TIME": "DATE_TIME",
         | 
| 74 | 
            +
                        }
         | 
| 75 | 
            +
                    },
         | 
| 76 | 
             
                }
         | 
| 77 |  | 
| 78 | 
             
                nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
         | 
| 79 |  | 
| 80 | 
            +
                registry = RecognizerRegistry()
         | 
| 81 | 
            +
                registry.load_predefined_recognizers(nlp_engine=nlp_engine)
         | 
| 82 | 
            +
             | 
| 83 | 
             
                return nlp_engine, registry
         | 
| 84 |  | 
| 85 |  | 
|  | |
| 92 | 
             
                would return NlpArtifacts such as POS and lemmas.
         | 
| 93 | 
             
                :param model_path: HuggingFace model path.
         | 
| 94 | 
             
                """
         | 
| 95 | 
            +
                print(f"Loading Transformers model: {model_path} of type {type(model_path)}")
         | 
| 96 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 97 | 
             
                nlp_configuration = {
         | 
| 98 | 
            +
                    "nlp_engine_name": "transformers",
         | 
| 99 | 
            +
                    "models": [
         | 
| 100 | 
            +
                        {
         | 
| 101 | 
            +
                            "lang_code": "en",
         | 
| 102 | 
            +
                            "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
         | 
| 103 | 
            +
                        }
         | 
| 104 | 
            +
                    ],
         | 
| 105 | 
            +
                    "ner_model_configuration": {
         | 
| 106 | 
            +
                        "model_to_presidio_entity_mapping": {
         | 
| 107 | 
            +
                            "PER": "PERSON",
         | 
| 108 | 
            +
                            "PERSON": "PERSON",
         | 
| 109 | 
            +
                            "LOC": "LOCATION",
         | 
| 110 | 
            +
                            "LOCATION": "LOCATION",
         | 
| 111 | 
            +
                            "GPE": "LOCATION",
         | 
| 112 | 
            +
                            "ORG": "ORGANIZATION",
         | 
| 113 | 
            +
                            "ORGANIZATION": "ORGANIZATION",
         | 
| 114 | 
            +
                            "NORP": "NRP",
         | 
| 115 | 
            +
                            "AGE": "AGE",
         | 
| 116 | 
            +
                            "ID": "ID",
         | 
| 117 | 
            +
                            "EMAIL": "EMAIL",
         | 
| 118 | 
            +
                            "PATIENT": "PERSON",
         | 
| 119 | 
            +
                            "STAFF": "PERSON",
         | 
| 120 | 
            +
                            "HOSP": "ORGANIZATION",
         | 
| 121 | 
            +
                            "PATORG": "ORGANIZATION",
         | 
| 122 | 
            +
                            "DATE": "DATE_TIME",
         | 
| 123 | 
            +
                            "TIME": "DATE_TIME",
         | 
| 124 | 
            +
                            "PHONE": "PHONE_NUMBER",
         | 
| 125 | 
            +
                            "HCW": "PERSON",
         | 
| 126 | 
            +
                            "HOSPITAL": "ORGANIZATION",
         | 
| 127 | 
            +
                            "FACILITY": "LOCATION",
         | 
| 128 | 
            +
                        },
         | 
| 129 | 
            +
                        "low_confidence_score_multiplier": 0.4,
         | 
| 130 | 
            +
                        "low_score_entity_names": ["ID"],
         | 
| 131 | 
            +
                        "labels_to_ignore": [
         | 
| 132 | 
            +
                            "CARDINAL",
         | 
| 133 | 
            +
                            "EVENT",
         | 
| 134 | 
            +
                            "LANGUAGE",
         | 
| 135 | 
            +
                            "LAW",
         | 
| 136 | 
            +
                            "MONEY",
         | 
| 137 | 
            +
                            "ORDINAL",
         | 
| 138 | 
            +
                            "PERCENT",
         | 
| 139 | 
            +
                            "PRODUCT",
         | 
| 140 | 
            +
                            "QUANTITY",
         | 
| 141 | 
            +
                            "WORK_OF_ART",
         | 
| 142 | 
            +
                        ],
         | 
| 143 | 
            +
                    },
         | 
| 144 | 
             
                }
         | 
| 145 |  | 
|  | |
|  | |
|  | |
| 146 | 
             
                nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
         | 
| 147 |  | 
| 148 | 
            +
                registry = RecognizerRegistry()
         | 
| 149 | 
            +
                registry.load_predefined_recognizers(nlp_engine=nlp_engine)
         | 
| 150 | 
            +
             | 
| 151 | 
             
                return nlp_engine, registry
         | 
| 152 |  | 
| 153 |  | 
|  | |
| 165 | 
             
                registry = RecognizerRegistry()
         | 
| 166 | 
             
                registry.load_predefined_recognizers()
         | 
| 167 |  | 
| 168 | 
            +
                # there is no official Flair NlpEngine, hence we load it as an additional recognizer
         | 
| 169 | 
            +
             | 
| 170 | 
             
                if not spacy.util.is_package("en_core_web_sm"):
         | 
| 171 | 
             
                    spacy.cli.download("en_core_web_sm")
         | 
| 172 | 
             
                # Using a small spaCy model + a Flair NER model
         | 
|  | |
| 183 | 
             
                return nlp_engine, registry
         | 
| 184 |  | 
| 185 |  | 
| 186 | 
            +
            def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
         | 
| 187 | 
             
                """
         | 
| 188 | 
             
                Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
         | 
| 189 | 
             
                The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
         | 
|  | |
| 191 | 
             
                :param ta_key: Azure Text Analytics key.
         | 
| 192 | 
             
                :param ta_endpoint: Azure Text Analytics endpoint.
         | 
| 193 | 
             
                """
         | 
| 194 | 
            +
                from azure_ai_language_wrapper import AzureAIServiceWrapper
         | 
| 195 |  | 
| 196 | 
             
                if not ta_key or not ta_endpoint:
         | 
| 197 | 
             
                    raise RuntimeError("Please fill in the Text Analytics endpoint details")
         | 
|  | |
| 199 | 
             
                registry = RecognizerRegistry()
         | 
| 200 | 
             
                registry.load_predefined_recognizers()
         | 
| 201 |  | 
| 202 | 
            +
                azure_ai_language_recognizer = AzureAIServiceWrapper(
         | 
| 203 | 
            +
                    ta_endpoint=ta_endpoint, ta_key=ta_key
         | 
| 204 | 
            +
                )
         | 
| 205 | 
             
                nlp_configuration = {
         | 
| 206 | 
             
                    "nlp_engine_name": "spacy",
         | 
| 207 | 
             
                    "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
         | 
|  | |
| 209 |  | 
| 210 | 
             
                nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
         | 
| 211 |  | 
| 212 | 
            +
                registry.add_recognizer(azure_ai_language_recognizer)
         | 
| 213 | 
             
                registry.remove_recognizer("SpacyRecognizer")
         | 
| 214 |  | 
| 215 | 
             
                return nlp_engine, registry
         | 
    	
        presidio_streamlit.py
    CHANGED
    
    | @@ -56,7 +56,8 @@ model_list = [ | |
| 56 | 
             
                "flair/ner-english-large",
         | 
| 57 | 
             
                "HuggingFace/obi/deid_roberta_i2b2",
         | 
| 58 | 
             
                "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
         | 
| 59 | 
            -
                " | 
|  | |
| 60 | 
             
                "Other",
         | 
| 61 | 
             
            ]
         | 
| 62 | 
             
            if not allow_other_models:
         | 
| @@ -75,22 +76,22 @@ st_model_package = st_model.split("/")[0] | |
| 75 | 
             
            # Remove package prefix (if needed)
         | 
| 76 | 
             
            st_model = (
         | 
| 77 | 
             
                st_model
         | 
| 78 | 
            -
                if st_model_package not in (" | 
| 79 | 
             
                else "/".join(st_model.split("/")[1:])
         | 
| 80 | 
             
            )
         | 
| 81 |  | 
| 82 | 
             
            if st_model == "Other":
         | 
| 83 | 
             
                st_model_package = st.sidebar.selectbox(
         | 
| 84 | 
            -
                    "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
         | 
| 85 | 
             
                )
         | 
| 86 | 
             
                st_model = st.sidebar.text_input(f"NER model name", value="")
         | 
| 87 |  | 
| 88 | 
            -
            if st_model == "Azure  | 
| 89 | 
             
                st_ta_key = st.sidebar.text_input(
         | 
| 90 | 
            -
                    f" | 
| 91 | 
             
                )
         | 
| 92 | 
             
                st_ta_endpoint = st.sidebar.text_input(
         | 
| 93 | 
            -
                    f" | 
| 94 | 
             
                    value=os.getenv("TA_ENDPOINT", default=""),
         | 
| 95 | 
             
                    help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
         | 
| 96 | 
             
                )
         | 
| @@ -124,23 +125,18 @@ open_ai_params = None | |
| 124 |  | 
| 125 | 
             
            logger.debug(f"st_operator: {st_operator}")
         | 
| 126 |  | 
| 127 | 
            -
             | 
| 128 | 
            -
             | 
| 129 | 
            -
             | 
| 130 | 
            -
             | 
| 131 | 
            -
                st_mask_char = st.sidebar.text_input(
         | 
| 132 | 
            -
                    "Mask character", value=st_mask_char, max_chars=1
         | 
| 133 | 
            -
                )
         | 
| 134 | 
            -
            elif st_operator == "encrypt":
         | 
| 135 | 
            -
                st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
         | 
| 136 | 
            -
            elif st_operator == "synthesize":
         | 
| 137 | 
             
                if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
         | 
| 138 | 
             
                    openai_api_type = "azure"
         | 
| 139 | 
             
                    st_openai_api_base = st.sidebar.text_input(
         | 
| 140 | 
             
                        "Azure OpenAI base URL",
         | 
| 141 | 
             
                        value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
         | 
| 142 | 
             
                    )
         | 
| 143 | 
            -
                     | 
|  | |
| 144 | 
             
                        "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
         | 
| 145 | 
             
                    )
         | 
| 146 | 
             
                    st_openai_version = st.sidebar.text_input(
         | 
| @@ -148,11 +144,13 @@ elif st_operator == "synthesize": | |
| 148 | 
             
                        value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
         | 
| 149 | 
             
                    )
         | 
| 150 | 
             
                else:
         | 
| 151 | 
            -
                     | 
| 152 | 
            -
                     | 
|  | |
|  | |
| 153 | 
             
                st_openai_key = st.sidebar.text_input(
         | 
| 154 | 
             
                    "OPENAI_KEY",
         | 
| 155 | 
            -
                    value= | 
| 156 | 
             
                    help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
         | 
| 157 | 
             
                    type="password",
         | 
| 158 | 
             
                )
         | 
| @@ -161,12 +159,40 @@ elif st_operator == "synthesize": | |
| 161 | 
             
                    value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
         | 
| 162 | 
             
                    help="See more here: https://platform.openai.com/docs/models/",
         | 
| 163 | 
             
                )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 164 |  | 
| 165 | 
             
                open_ai_params = OpenAIParams(
         | 
| 166 | 
             
                    openai_key=st_openai_key,
         | 
| 167 | 
             
                    model=st_openai_model,
         | 
| 168 | 
             
                    api_base=st_openai_api_base,
         | 
| 169 | 
            -
                     | 
| 170 | 
             
                    api_version=st_openai_version,
         | 
| 171 | 
             
                    api_type=openai_api_type,
         | 
| 172 | 
             
                )
         | 
| @@ -214,7 +240,8 @@ with st.expander("About this demo", expanded=False): | |
| 214 | 
             
                    \n\n[Code](https://aka.ms/presidio) | 
         | 
| 215 | 
             
                    [Tutorial](https://microsoft.github.io/presidio/tutorial/) | 
         | 
| 216 | 
             
                    [Installation](https://microsoft.github.io/presidio/installation/) | 
         | 
| 217 | 
            -
                    [FAQ](https://microsoft.github.io/presidio/faq/) | | 
|  | |
| 218 | 
             
                )
         | 
| 219 |  | 
| 220 | 
             
                st.info(
         | 
|  | |
| 56 | 
             
                "flair/ner-english-large",
         | 
| 57 | 
             
                "HuggingFace/obi/deid_roberta_i2b2",
         | 
| 58 | 
             
                "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
         | 
| 59 | 
            +
                "stanza/en",
         | 
| 60 | 
            +
                "Azure AI Language",
         | 
| 61 | 
             
                "Other",
         | 
| 62 | 
             
            ]
         | 
| 63 | 
             
            if not allow_other_models:
         | 
|  | |
| 76 | 
             
            # Remove package prefix (if needed)
         | 
| 77 | 
             
            st_model = (
         | 
| 78 | 
             
                st_model
         | 
| 79 | 
            +
                if st_model_package.lower() not in ("spacy", "stanza", "huggingface")
         | 
| 80 | 
             
                else "/".join(st_model.split("/")[1:])
         | 
| 81 | 
             
            )
         | 
| 82 |  | 
| 83 | 
             
            if st_model == "Other":
         | 
| 84 | 
             
                st_model_package = st.sidebar.selectbox(
         | 
| 85 | 
            +
                    "NER model OSS package", options=["spaCy", "stanza", "Flair", "HuggingFace"]
         | 
| 86 | 
             
                )
         | 
| 87 | 
             
                st_model = st.sidebar.text_input(f"NER model name", value="")
         | 
| 88 |  | 
| 89 | 
            +
            if st_model == "Azure AI Language":
         | 
| 90 | 
             
                st_ta_key = st.sidebar.text_input(
         | 
| 91 | 
            +
                    f"Azure AI Language key", value=os.getenv("TA_KEY", ""), type="password"
         | 
| 92 | 
             
                )
         | 
| 93 | 
             
                st_ta_endpoint = st.sidebar.text_input(
         | 
| 94 | 
            +
                    f"Azure AI Language endpoint",
         | 
| 95 | 
             
                    value=os.getenv("TA_ENDPOINT", default=""),
         | 
| 96 | 
             
                    help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
         | 
| 97 | 
             
                )
         | 
|  | |
| 125 |  | 
| 126 | 
             
            logger.debug(f"st_operator: {st_operator}")
         | 
| 127 |  | 
| 128 | 
            +
             | 
| 129 | 
            +
            def set_up_openai_synthesis():
         | 
| 130 | 
            +
                """Set up the OpenAI API key and model for text synthesis."""
         | 
| 131 | 
            +
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 132 | 
             
                if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
         | 
| 133 | 
             
                    openai_api_type = "azure"
         | 
| 134 | 
             
                    st_openai_api_base = st.sidebar.text_input(
         | 
| 135 | 
             
                        "Azure OpenAI base URL",
         | 
| 136 | 
             
                        value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
         | 
| 137 | 
             
                    )
         | 
| 138 | 
            +
                    openai_key = os.getenv("AZURE_OPENAI_KEY", default="")
         | 
| 139 | 
            +
                    st_deployment_id = st.sidebar.text_input(
         | 
| 140 | 
             
                        "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
         | 
| 141 | 
             
                    )
         | 
| 142 | 
             
                    st_openai_version = st.sidebar.text_input(
         | 
|  | |
| 144 | 
             
                        value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
         | 
| 145 | 
             
                    )
         | 
| 146 | 
             
                else:
         | 
| 147 | 
            +
                    openai_api_type = "openai"
         | 
| 148 | 
            +
                    st_openai_version = st_openai_api_base = None
         | 
| 149 | 
            +
                    st_deployment_id = ""
         | 
| 150 | 
            +
                    openai_key = os.getenv("OPENAI_KEY", default="")
         | 
| 151 | 
             
                st_openai_key = st.sidebar.text_input(
         | 
| 152 | 
             
                    "OPENAI_KEY",
         | 
| 153 | 
            +
                    value=openai_key,
         | 
| 154 | 
             
                    help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
         | 
| 155 | 
             
                    type="password",
         | 
| 156 | 
             
                )
         | 
|  | |
| 159 | 
             
                    value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
         | 
| 160 | 
             
                    help="See more here: https://platform.openai.com/docs/models/",
         | 
| 161 | 
             
                )
         | 
| 162 | 
            +
                return (
         | 
| 163 | 
            +
                    openai_api_type,
         | 
| 164 | 
            +
                    st_openai_api_base,
         | 
| 165 | 
            +
                    st_deployment_id,
         | 
| 166 | 
            +
                    st_openai_version,
         | 
| 167 | 
            +
                    st_openai_key,
         | 
| 168 | 
            +
                    st_openai_model,
         | 
| 169 | 
            +
                )
         | 
| 170 | 
            +
             | 
| 171 | 
            +
             | 
| 172 | 
            +
            if st_operator == "mask":
         | 
| 173 | 
            +
                st_number_of_chars = st.sidebar.number_input(
         | 
| 174 | 
            +
                    "number of chars", value=st_number_of_chars, min_value=0, max_value=100
         | 
| 175 | 
            +
                )
         | 
| 176 | 
            +
                st_mask_char = st.sidebar.text_input(
         | 
| 177 | 
            +
                    "Mask character", value=st_mask_char, max_chars=1
         | 
| 178 | 
            +
                )
         | 
| 179 | 
            +
            elif st_operator == "encrypt":
         | 
| 180 | 
            +
                st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
         | 
| 181 | 
            +
            elif st_operator == "synthesize":
         | 
| 182 | 
            +
                (
         | 
| 183 | 
            +
                    openai_api_type,
         | 
| 184 | 
            +
                    st_openai_api_base,
         | 
| 185 | 
            +
                    st_deployment_id,
         | 
| 186 | 
            +
                    st_openai_version,
         | 
| 187 | 
            +
                    st_openai_key,
         | 
| 188 | 
            +
                    st_openai_model,
         | 
| 189 | 
            +
                ) = set_up_openai_synthesis()
         | 
| 190 |  | 
| 191 | 
             
                open_ai_params = OpenAIParams(
         | 
| 192 | 
             
                    openai_key=st_openai_key,
         | 
| 193 | 
             
                    model=st_openai_model,
         | 
| 194 | 
             
                    api_base=st_openai_api_base,
         | 
| 195 | 
            +
                    deployment_id=st_deployment_id,
         | 
| 196 | 
             
                    api_version=st_openai_version,
         | 
| 197 | 
             
                    api_type=openai_api_type,
         | 
| 198 | 
             
                )
         | 
|  | |
| 240 | 
             
                    \n\n[Code](https://aka.ms/presidio) | 
         | 
| 241 | 
             
                    [Tutorial](https://microsoft.github.io/presidio/tutorial/) | 
         | 
| 242 | 
             
                    [Installation](https://microsoft.github.io/presidio/installation/) | 
         | 
| 243 | 
            +
                    [FAQ](https://microsoft.github.io/presidio/faq/) |
         | 
| 244 | 
            +
                    [Feedback](https://forms.office.com/r/9ufyYjfDaY) |"""
         | 
| 245 | 
             
                )
         | 
| 246 |  | 
| 247 | 
             
                st.info(
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -1,4 +1,5 @@ | |
| 1 | 
            -
            presidio-analyzer
         | 
|  | |
| 2 | 
             
            presidio-anonymizer
         | 
| 3 | 
             
            streamlit
         | 
| 4 | 
             
            streamlit-tags
         | 
| @@ -6,8 +7,6 @@ pandas | |
| 6 | 
             
            python-dotenv
         | 
| 7 | 
             
            st-annotated-text
         | 
| 8 | 
             
            torch
         | 
| 9 | 
            -
            transformers
         | 
| 10 | 
             
            flair
         | 
| 11 | 
             
            openai
         | 
| 12 | 
            -
            spacy
         | 
| 13 | 
             
            azure-ai-textanalytics
         | 
|  | |
| 1 | 
            +
            presidio-analyzer[transformers]
         | 
| 2 | 
            +
            presidio-analyzer[stanza]
         | 
| 3 | 
             
            presidio-anonymizer
         | 
| 4 | 
             
            streamlit
         | 
| 5 | 
             
            streamlit-tags
         | 
|  | |
| 7 | 
             
            python-dotenv
         | 
| 8 | 
             
            st-annotated-text
         | 
| 9 | 
             
            torch
         | 
|  | |
| 10 | 
             
            flair
         | 
| 11 | 
             
            openai
         | 
|  | |
| 12 | 
             
            azure-ai-textanalytics
         | 
    	
        test_streamlit.py
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from presidio_helpers import analyzer_engine, analyze, anonymize
         | 
| 2 | 
            +
             | 
| 3 | 
            +
             | 
| 4 | 
            +
            def test_streamlit_logic():
                """Smoke-test the analyze -> anonymize flow used by the demo.

                Runs the stanza ``en`` model (no Azure AI Language key or
                endpoint) over the contents of ``demo_text.txt`` and checks that
                anonymizing with the ``replace`` operator does not return an
                empty string.
                """
                # Positional arguments shared by analyzer_engine() and analyze():
                # (model package, model name, Azure AI Language key, endpoint).
                engine_args = ("stanza", "en", None, None)

                # Default demo text, resolved against the current working directory.
                with open("demo_text.txt") as text_file:
                    sample_text = text_file.read()

                # instantiate and cache AnalyzerEngine
                analyzer_engine(*engine_args)

                # Detection settings for the analysis step.
                analyze_options = {
                    "text": sample_text,
                    "entities": "All",
                    "language": "en",
                    "score_threshold": 0.35,
                    "return_decision_process": True,
                    "allow_list": [],
                    "deny_list": [],
                }
                findings = analyze(*engine_args, **analyze_options)

                # Only the "replace" operator is exercised, so the mask/encrypt
                # specific settings are passed as None.
                redacted = anonymize(
                    text=sample_text,
                    operator="replace",
                    mask_char=None,
                    number_of_chars=None,
                    encrypt_key=None,
                    analyze_results=findings,
                )

                assert redacted.text != ""
         | 
