Farnazgh committed
Commit b494f67
1 Parent(s): a825aee
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/Aliae_anonymizer.iml ADDED
@@ -0,0 +1,10 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$">
+       <excludeFolder url="file://$MODULE_DIR$/venv" />
+     </content>
+     <orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,12 @@
+ <component name="InspectionProjectProfileManager">
+   <profile version="1.0">
+     <option name="myName" value="Project Default" />
+     <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+       <option name="ignoredIdentifiers">
+         <list>
+           <option value="graphbot.graphize.GraphBot.graphize" />
+         </list>
+       </option>
+     </inspection_tool>
+   </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/Aliae_anonymizer.iml" filepath="$PROJECT_DIR$/.idea/Aliae_anonymizer.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="" vcs="Git" />
+   </component>
+ </project>
__pycache__/presidio_helpers.cpython-310.pyc ADDED
Binary file (6.11 kB)
__pycache__/presidio_nlp_engine_config.cpython-310.pyc ADDED
Binary file (1.13 kB)
en_demo_text.txt ADDED
@@ -0,0 +1,14 @@
+ Hello, my name is David Johnson and I live in Maine.
+ My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+
+ On September 18 I visited microsoft.com and sent an email to [email protected], from the IP 192.168.0.1.
+
+ My passport: 59RF05400 and my phone number: +330788848206.
+
+ This is a valid International Bank Account Number: FR76 3000 6000 0112 3456 7890 189 or FR7630006000011234567890189.
+
+ Kate's social security number is 269054958815780.
+
+ Pierre's nationality is French. He was born on 01/02/1990.
+
+ His national id is 345623456789 or maybe X4RTBPFW4.
fr_demo_text.txt ADDED
@@ -0,0 +1,14 @@
+ Bonjour, je m'appelle David Johnson et j'habite dans le Maine.
+ Mon numéro de carte de crédit est 4095-2609-9393-4932 et mon identifiant de portefeuille crypto est 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+
+ Le 18 septembre, j'ai visité microsoft.com et envoyé un e-mail à [email protected], à partir de l'IP 192.168.0.1.
+
+ Mon passeport : 59RF05400 et mon numéro de téléphone : +330788848206.
+
+ Il s'agit d'un numéro de compte bancaire international valide : FR76 3000 6000 0112 3456 7890 189 ou FR7630006000011234567890189.
+
+ Le numéro de sécurité sociale de Kate est le 269054958815780.
+
+ La nationalité de Pierre est française. Il est né le 01/02/1990.
+
+ Son identifiant national est 345623456789 ou peut-être X4RTBPFW4.
logo.png ADDED
presidio_helpers.py ADDED
@@ -0,0 +1,261 @@
+ """
+ Helper methods for the Presidio Streamlit app
+ """
+ from typing import List, Optional, Tuple
+ import logging
+ import streamlit as st
+ from presidio_analyzer import (
+     AnalyzerEngine,
+     RecognizerResult,
+     RecognizerRegistry,
+     PatternRecognizer,
+     Pattern,
+ )
+ from presidio_analyzer.nlp_engine import NlpEngine
+ from presidio_anonymizer import AnonymizerEngine
+ from presidio_anonymizer.entities import OperatorConfig
+
+ # from openai_fake_data_generator import (
+ #     set_openai_params,
+ #     call_completion_model,
+ #     create_prompt,
+ #     OpenAIParams,
+ # )
+ from presidio_nlp_engine_config import (
+     create_nlp_engine_with_spacy,
+     # create_nlp_engine_with_flair,
+     # create_nlp_engine_with_transformers,
+     # create_nlp_engine_with_azure_text_analytics,
+ )
+
+ logger = logging.getLogger("presidio-streamlit")
+
+
+ @st.cache_resource
+ def nlp_engine_and_registry(
+     model_family: str,
+     model_path: str,
+     ta_key: Optional[str] = None,
+     ta_endpoint: Optional[str] = None,
+ ) -> Tuple[NlpEngine, RecognizerRegistry]:
+     """Create the NLP Engine instance based on the requested model.
+     :param model_family: Which model package to use for NER.
+     :param model_path: Which model to use for NER. E.g.,
+         "StanfordAIMI/stanford-deidentifier-base",
+         "obi/deid_roberta_i2b2",
+         "en_core_web_lg"
+     :param ta_key: Key to the Text Analytics endpoint (only if model_path = "Azure Text Analytics")
+     :param ta_endpoint: Endpoint of the Text Analytics instance (only if model_path = "Azure Text Analytics")
+     """
+
+     # Set up NLP Engine according to the model of choice
+     if "spaCy" in model_family:
+         return create_nlp_engine_with_spacy(model_path)
+     # elif "flair" in model_family:
+     #     return create_nlp_engine_with_flair(model_path)
+     elif "HuggingFace" in model_family:
+         return create_nlp_engine_with_transformers(model_path)
+     # elif "Azure Text Analytics" in model_family:
+     #     return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
+     # else:
+     #     raise ValueError(f"Model family {model_family} not supported")
+
+
+ @st.cache_resource
+ def analyzer_engine(
+     model_family: str,
+     model_path: str,
+     ta_key: Optional[str] = None,
+     ta_endpoint: Optional[str] = None,
+ ) -> AnalyzerEngine:
+     """Create the NLP Engine instance based on the requested model.
+     :param model_family: Which model package to use for NER.
+     :param model_path: Which model to use for NER:
+         "StanfordAIMI/stanford-deidentifier-base",
+         "obi/deid_roberta_i2b2",
+         "en_core_web_lg"
+     :param ta_key: Key to the Text Analytics endpoint (only if model_path = "Azure Text Analytics")
+     :param ta_endpoint: Endpoint of the Text Analytics instance (only if model_path = "Azure Text Analytics")
+     """
+     nlp_engine, registry = nlp_engine_and_registry(
+         model_family, model_path, ta_key, ta_endpoint
+     )
+     analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry, supported_languages=['fr', 'en'])
+     return analyzer
+
+
+ @st.cache_resource
+ def anonymizer_engine():
+     """Return AnonymizerEngine."""
+     return AnonymizerEngine()
+
+
+ @st.cache_data
+ def get_supported_entities(
+     model_family: str, model_path: str, ta_key: str, ta_endpoint: str
+ ):
+     """Return supported entities from the Analyzer Engine."""
+     # return analyzer_engine(
+     #     model_family, model_path, ta_key, ta_endpoint
+     # ).get_supported_entities() + ["GENERIC_PII"]
+     return ["PERSON", "IBAN_CODE", "PHONE_NUMBER", "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IP_ADDRESS", "NRP", "LOCATION", "URL", "FRENCH_SSN", "FRENCH_PASS", "FRENCH_NID"]
+
+
+ @st.cache_data
+ def analyze(
+     model_family: str, model_path: str, ta_key: str, ta_endpoint: str, **kwargs
+ ):
+     """Analyze input using Analyzer engine and input arguments (kwargs)."""
+     if "entities" not in kwargs or "All" in kwargs["entities"]:
+         kwargs["entities"] = None
+
+     if "deny_list" in kwargs and kwargs["deny_list"] is not None:
+         ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(kwargs["deny_list"])
+         kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
+         del kwargs["deny_list"]
+
+     if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0:
+         ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"])
+         kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
+         del kwargs["regex_params"]
+
+     return analyzer_engine(model_family, model_path, ta_key, ta_endpoint).analyze(
+         **kwargs
+     )
+
+
+ def anonymize(
+     text: str,
+     operator: str,
+     analyze_results: List[RecognizerResult],
+     mask_char: Optional[str] = None,
+     number_of_chars: Optional[int] = None,
+     encrypt_key: Optional[str] = None,
+ ):
+     """Anonymize identified input using Presidio Anonymizer.
+
+     :param text: Full text
+     :param operator: Operator name
+     :param mask_char: Mask char (for mask operator)
+     :param number_of_chars: Number of characters to mask (for mask operator)
+     :param encrypt_key: Encryption key (for encrypt operator)
+     :param analyze_results: list of results from presidio analyzer engine
+     """
+
+     if operator == "mask":
+         operator_config = {
+             "type": "mask",
+             "masking_char": mask_char,
+             "chars_to_mask": number_of_chars,
+             "from_end": False,
+         }
+
+     # Define operator config
+     elif operator == "encrypt":
+         operator_config = {"key": encrypt_key}
+     elif operator == "highlight":
+         operator_config = {"lambda": lambda x: x}
+     else:
+         operator_config = None
+
+     # Change operator if needed as intermediate step
+     if operator == "highlight":
+         operator = "custom"
+     elif operator == "synthesize":
+         operator = "replace"
+     else:
+         operator = operator
+
+     res = anonymizer_engine().anonymize(
+         text,
+         analyze_results,
+         operators={"DEFAULT": OperatorConfig(operator, operator_config)},
+     )
+     return res
+
+
+ def annotate(text: str, analyze_results: List[RecognizerResult]):
+     """Highlight the identified PII entities on the original text
+
+     :param text: Full text
+     :param analyze_results: list of results from presidio analyzer engine
+     """
+     tokens = []
+
+     # Use the anonymizer to resolve overlaps
+     results = anonymize(
+         text=text,
+         operator="highlight",
+         analyze_results=analyze_results,
+     )
+
+     # sort by start index
+     results = sorted(results.items, key=lambda x: x.start)
+     for i, res in enumerate(results):
+         if i == 0:
+             tokens.append(text[: res.start])
+
+         # append entity text and entity type
+         tokens.append((text[res.start : res.end], res.entity_type))
+
+         # if another entity coming i.e. we're not at the last results element, add text up to next entity
+         if i != len(results) - 1:
+             tokens.append(text[res.end : results[i + 1].start])
+         # if no more entities coming, add all remaining text
+         else:
+             tokens.append(text[res.end :])
+     return tokens
+
+
+ # def create_fake_data(
+ #     text: str,
+ #     analyze_results: List[RecognizerResult],
+ #     openai_params: OpenAIParams,
+ # ):
+ #     """Creates a synthetic version of the text using OpenAI APIs"""
+ #     if not openai_params.openai_key:
+ #         return "Please provide your OpenAI key"
+ #     results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
+ #     set_openai_params(openai_params)
+ #     prompt = create_prompt(results.text)
+ #     print(f"Prompt: {prompt}")
+ #     fake = call_openai_api(
+ #         prompt=prompt,
+ #         openai_model_name=openai_params.model,
+ #         openai_deployment_name=openai_params.deployment_name,
+ #     )
+ #     return fake
+
+
+ # @st.cache_data
+ # def call_openai_api(
+ #     prompt: str, openai_model_name: str, openai_deployment_name: Optional[str] = None
+ # ) -> str:
+ #     fake_data = call_completion_model(
+ #         prompt, model=openai_model_name, deployment_id=openai_deployment_name
+ #     )
+ #     return fake_data
+
+
+ def create_ad_hoc_deny_list_recognizer(
+     deny_list: Optional[List[str]] = None,
+ ) -> Optional[PatternRecognizer]:
+     if not deny_list:
+         return None
+
+     deny_list_recognizer = PatternRecognizer(
+         supported_entity="GENERIC_PII", deny_list=deny_list
+     )
+     return deny_list_recognizer
+
+
+ def create_ad_hoc_regex_recognizer(
+     regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
+ ) -> Optional[PatternRecognizer]:
+     if not regex:
+         return None
+     pattern = Pattern(name="Regex pattern", regex=regex, score=score)
+     regex_recognizer = PatternRecognizer(
+         supported_entity=entity_type, patterns=[pattern], context=context
+     )
+     return regex_recognizer
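
Editorial usage sketch (not part of the committed files): a minimal way to exercise the helpers above outside Streamlit, assuming en_core_web_lg is installed and recognizers.yaml sits in the working directory; the sample sentence comes from en_demo_text.txt and the printed output is only illustrative.

    # Editorial sketch: run the committed helpers outside the Streamlit app.
    from presidio_helpers import analyze, anonymize

    text = "Hello, my name is David Johnson and I live in Maine."
    results = analyze(
        "spaCy", "en_core_web_lg", "", "",  # model_family, model_path, ta_key, ta_endpoint
        text=text,
        language="en",
        score_threshold=0.35,
    )
    redacted = anonymize(text=text, operator="replace", analyze_results=results)
    print(redacted.text)  # e.g. "Hello, my name is <PERSON> and I live in <LOCATION>."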
presidio_nlp_engine_config.py ADDED
@@ -0,0 +1,141 @@
+ from typing import Tuple
+ import logging
+ import spacy
+ from presidio_analyzer import RecognizerRegistry
+ from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
+
+ logger = logging.getLogger("presidio-streamlit")
+
+
+ def create_nlp_engine_with_spacy(
+     model_path: str,
+ ) -> Tuple[NlpEngine, RecognizerRegistry]:
+     """
+     Instantiate an NlpEngine with a spaCy model
+     :param model_path: spaCy model path.
+     """
+     if not spacy.util.is_package(model_path):
+         spacy.cli.download(model_path)
+
+     nlp_configuration = {
+         "nlp_engine_name": "spacy",
+         "models": [{"lang_code": model_path.split('_')[0], "model_name": model_path}],
+     }
+
+     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+
+     registry = RecognizerRegistry()
+     # registry.load_predefined_recognizers()
+     registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=["fr", "en"])
+     registry.add_recognizers_from_yaml("recognizers.yaml")
+
+
+
+     return nlp_engine, registry
+
+
+ # def create_nlp_engine_with_transformers(
+ #     model_path: str,
+ # ) -> Tuple[NlpEngine, RecognizerRegistry]:
+ #     """
+ #     Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
+ #     The TransformersRecognizer would return results from Transformers models, the spaCy model
+ #     would return NlpArtifacts such as POS and lemmas.
+ #     :param model_path: HuggingFace model path.
+ #     """
+ #
+ #     from transformers_rec import (
+ #         STANFORD_COFIGURATION,
+ #         BERT_DEID_CONFIGURATION,
+ #         TransformersRecognizer,
+ #     )
+ #
+ #     registry = RecognizerRegistry()
+ #     registry.load_predefined_recognizers()
+ #
+ #     if not spacy.util.is_package("en_core_web_sm"):
+ #         spacy.cli.download("en_core_web_sm")
+ #     # Using a small spaCy model + a HF NER model
+ #     transformers_recognizer = TransformersRecognizer(model_path=model_path)
+ #
+ #     if model_path == "StanfordAIMI/stanford-deidentifier-base":
+ #         transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
+ #     elif model_path == "obi/deid_roberta_i2b2":
+ #         transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+ #     else:
+ #         print(f"Warning: Model has no configuration, loading default.")
+ #         transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+ #
+ #     # Use small spaCy model, no need for both spacy and HF models
+ #     # The transformers model is used here as a recognizer, not as an NlpEngine
+ #     nlp_configuration = {
+ #         "nlp_engine_name": "spacy",
+ #         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+ #     }
+ #
+ #     registry.add_recognizer(transformers_recognizer)
+ #     registry.remove_recognizer("SpacyRecognizer")
+ #
+ #     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+ #
+ #     return nlp_engine, registry
+
+
+ # def create_nlp_engine_with_flair(
+ #     model_path: str,
+ # ) -> Tuple[NlpEngine, RecognizerRegistry]:
+ #     """
+ #     Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
+ #     The FlairRecognizer would return results from Flair models, the spaCy model
+ #     would return NlpArtifacts such as POS and lemmas.
+ #     :param model_path: Flair model path.
+ #     """
+ #     from flair_recognizer import FlairRecognizer
+ #
+ #     registry = RecognizerRegistry()
+ #     registry.load_predefined_recognizers()
+ #
+ #     if not spacy.util.is_package("en_core_web_sm"):
+ #         spacy.cli.download("en_core_web_sm")
+ #     # Using a small spaCy model + a Flair NER model
+ #     flair_recognizer = FlairRecognizer(model_path=model_path)
+ #     nlp_configuration = {
+ #         "nlp_engine_name": "spacy",
+ #         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+ #     }
+ #     registry.add_recognizer(flair_recognizer)
+ #     registry.remove_recognizer("SpacyRecognizer")
+ #
+ #     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+ #
+ #     return nlp_engine, registry
+
+
+ # def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
+ #     """
+ #     Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
+ #     The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
+ #     would return NlpArtifacts such as POS and lemmas.
+ #     :param ta_key: Azure Text Analytics key.
+ #     :param ta_endpoint: Azure Text Analytics endpoint.
+ #     """
+ #     from text_analytics_wrapper import TextAnalyticsWrapper
+ #
+ #     if not ta_key or not ta_endpoint:
+ #         raise RuntimeError("Please fill in the Text Analytics endpoint details")
+ #
+ #     registry = RecognizerRegistry()
+ #     registry.load_predefined_recognizers()
+ #
+ #     ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
+ #     nlp_configuration = {
+ #         "nlp_engine_name": "spacy",
+ #         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+ #     }
+ #
+ #     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+ #
+ #     registry.add_recognizer(ta_recognizer)
+ #     registry.remove_recognizer("SpacyRecognizer")
+ #
+ #     return nlp_engine, registry
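
Editorial sketch (not part of the committed files): how the engine/registry pair returned by create_nlp_engine_with_spacy could be wired into an AnalyzerEngine directly, mirroring what presidio_helpers.analyzer_engine does; this assumes fr_core_news_md can be downloaded and recognizers.yaml is present in the working directory.

    # Editorial sketch: build the French engine and registry without the Streamlit layer.
    from presidio_analyzer import AnalyzerEngine
    from presidio_nlp_engine_config import create_nlp_engine_with_spacy

    nlp_engine, registry = create_nlp_engine_with_spacy("fr_core_news_md")
    analyzer = AnalyzerEngine(
        nlp_engine=nlp_engine, registry=registry, supported_languages=["fr", "en"]
    )
    results = analyzer.analyze(text="Mon passeport : 59RF05400", language="fr")
    for res in results:
        print(res.entity_type, res.start, res.end, res.score)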
presidio_streamlit.py ADDED
@@ -0,0 +1,352 @@
+ """Streamlit app for Presidio."""
+ import logging
+ import os
+ import traceback
+
+ import dotenv
+ import pandas as pd
+ import streamlit as st
+ import streamlit.components.v1 as components
+ from annotated_text import annotated_text
+ from streamlit_tags import st_tags
+
+ # from openai_fake_data_generator import OpenAIParams
+ from presidio_helpers import (
+     get_supported_entities,
+     analyze,
+     anonymize,
+     annotate,
+     # create_fake_data,
+     analyzer_engine,
+ )
+
+ st.set_page_config(
+     page_title="Presidio demo",
+     layout="wide",
+     initial_sidebar_state="expanded",
+     # menu_items={
+     #     "About": "https://microsoft.github.io/presidio/",
+     # },
+ )
+
+ dotenv.load_dotenv()
+ logger = logging.getLogger("presidio-streamlit")
+
+
+ allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False)
+
+
+ # Sidebar
+ st.sidebar.header(
+     """
+ Personal Info Anonymization
+ """
+ )
+
+ # set aliae logo
+ st.sidebar.image('logo.png', use_column_width=True)
+
+
+ model_help_text = """
+     Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
+     Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair,
+     as well as services such as Azure Text Analytics PII.
+     """
+ st_ta_key = st_ta_endpoint = ""
+
+ model_list = [
+     "spaCy/en_core_web_lg",
+     "spaCy/fr_core_news_md",
+ ]
+ # "flair/ner-english-large",
+ #
+ # "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
+ # "Azure Text Analytics PII",
+ # "Other",
+
+
+ # if not allow_other_models:
+ #     model_list.pop()
+
+
+ # Select language
+ lang = st.sidebar.selectbox(
+     "Language",
+     ['en', 'fr'],
+     index=0,
+ )
+
+ # Extract model package.
+ # st_model_package = st_model.split("/")[0]
+ st_model_package = 'spaCy'
+
+ # # Remove package prefix (if needed)
+ # st_model = (
+ #     st_model
+ #     if st_model_package not in ("spaCy", "HuggingFace")
+ #     else "/".join(st_model.split("/")[1:])
+ # )
+ st_model = 'en_core_web_lg'
+ if lang == 'en': st_model = 'en_core_web_lg'
+ elif lang == 'fr': st_model = 'fr_core_news_md'
+
+ # if st_model == "Other":
+ #     st_model_package = st.sidebar.selectbox(
+ #         "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
+ #     )
+ #     st_model = st.sidebar.text_input(f"NER model name", value="")
+
+ # if st_model == "Azure Text Analytics PII":
+ #     st_ta_key = st.sidebar.text_input(
+ #         f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
+ #     )
+ #     st_ta_endpoint = st.sidebar.text_input(
+ #         f"Text Analytics endpoint",
+ #         value=os.getenv("TA_ENDPOINT", default=""),
+ #         help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
+ #     )
+
+
+ # st.sidebar.warning("Note: Models might take some time to download. ")
+
+ analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
+ logger.debug(f"analyzer_params: {analyzer_params}")
+
+ st_operator = st.sidebar.selectbox(
+     "De-identification approach",
+     ["redact", "replace", "highlight"],
+     index=2,
+     help="""
+     Select which manipulation to the text is requested after PII has been identified.\n
+     - Redact: Completely remove the PII text\n
+     - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
+     - Highlight: Shows the original text with PII highlighted in colors\n
+     """,
+ )
+ st_mask_char = "*"
+ st_number_of_chars = 15
+ st_encrypt_key = "WmZq4t7w!z%C&F)J"
+
+ open_ai_params = None
+
+ logger.debug(f"st_operator: {st_operator}")
+
+ # if st_operator == "mask":
+ #     st_number_of_chars = st.sidebar.number_input(
+ #         "number of chars", value=st_number_of_chars, min_value=0, max_value=100
+ #     )
+ #     st_mask_char = st.sidebar.text_input(
+ #         "Mask character", value=st_mask_char, max_chars=1
+ #     )
+ # elif st_operator == "encrypt":
+ #     st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
+ # elif st_operator == "synthesize":
+ #     if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
+ #         openai_api_type = "azure"
+ #         st_openai_api_base = st.sidebar.text_input(
+ #             "Azure OpenAI base URL",
+ #             value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
+ #         )
+ #         st_deployment_name = st.sidebar.text_input(
+ #             "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
+ #         )
+ #         st_openai_version = st.sidebar.text_input(
+ #             "OpenAI version",
+ #             value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
+ #         )
+ #     else:
+ #         st_openai_version = openai_api_type = st_openai_api_base = None
+ #         st_deployment_name = ""
+ #     st_openai_key = st.sidebar.text_input(
+ #         "OPENAI_KEY",
+ #         value=os.getenv("OPENAI_KEY", default=""),
+ #         help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
+ #         type="password",
+ #     )
+ #     st_openai_model = st.sidebar.text_input(
+ #         "OpenAI model for text synthesis",
+ #         value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
+ #         help="See more here: https://platform.openai.com/docs/models/",
+ #     )
+ #
+ #     open_ai_params = OpenAIParams(
+ #         openai_key=st_openai_key,
+ #         model=st_openai_model,
+ #         api_base=st_openai_api_base,
+ #         deployment_name=st_deployment_name,
+ #         api_version=st_openai_version,
+ #         api_type=openai_api_type,
+ #     )
+
+ # st_threshold = st.sidebar.slider(
+ #     label="Acceptance threshold",
+ #     min_value=0.0,
+ #     max_value=1.0,
+ #     value=0.35,
+ #     help="Define the threshold for accepting a detection as PII. See more here: ",
+ # )
+ st_threshold = 0.35
+ #
+ # st_return_decision_process = st.sidebar.checkbox(
+ #     "Add analysis explanations to findings",
+ #     value=False,
+ #     help="Add the decision process to the output table. "
+ #     "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
+ # )
+ st_return_decision_process = False
+
+ # # Allow and deny lists
+ # st_deny_allow_expander = st.sidebar.expander(
+ #     "Allowlists and denylists",
+ #     expanded=False,
+ # )
+ #
+ # with st_deny_allow_expander:
+ #     st_allow_list = st_tags(
+ #         label="Add words to the allowlist", text="Enter word and press enter."
+ #     )
+ #     st.caption(
+ #         "Allowlists contain words that are not considered PII, but are detected as such."
+ #     )
+ #
+ #     st_deny_list = st_tags(
+ #         label="Add words to the denylist", text="Enter word and press enter."
+ #     )
+ #     st.caption(
+ #         "Denylists contain words that are considered PII, but are not detected as such."
+ #     )
+ st_allow_list = []
+ st_deny_list = []
+ # Main panel
+
+ with st.expander("About Microsoft Presidio", expanded=False):
+     st.info(
+         """Presidio is an open source customizable framework for PII detection and de-identification."""
+     )
+
+ analyzer_load_state = st.info("Starting Presidio analyzer...")
+
+ analyzer_load_state.empty()
+
+ # Read default text
+ with open("en_demo_text.txt") as f:
+     en_demo_text = f.readlines()
+ with open("fr_demo_text.txt") as f:
+     fr_demo_text = f.readlines()
+
+ if lang == 'en': demo_text = en_demo_text
+ elif lang == 'fr': demo_text = fr_demo_text
+
+ # Create two columns for before and after
+ col1, col2 = st.columns(2)
+
+ # Before:
+ col1.subheader("Input")
+ st_text = col1.text_area(
+     label="Enter text", value="".join(demo_text), height=400, key="text_input"
+ )
+
+ try:
+     # Choose entities
+     st_entities_expander = st.sidebar.expander("Choose entities to look for")
+     st_entities = st_entities_expander.multiselect(
+         label="Which entities to look for?",
+         options=get_supported_entities(*analyzer_params),
+         default=list(get_supported_entities(*analyzer_params)),
+         help="Limit the list of PII entities detected. "
+         "This list is dynamic and based on the NER model and registered recognizers. "
+         "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
+     )
+
+     # Before
+     analyzer_load_state = st.info("Starting Presidio analyzer...")
+     analyzer = analyzer_engine(*analyzer_params)
+     analyzer_load_state.empty()
+
+     st_analyze_results = analyze(
+         *analyzer_params,
+         text=st_text,
+         entities=st_entities,
+         language=lang,
+         score_threshold=st_threshold,
+         return_decision_process=st_return_decision_process,
+         allow_list=st_allow_list,
+         deny_list=st_deny_list,
+     )
+
+     # After
+     if st_operator not in ("highlight", "synthesize"):
+         with col2:
+             st.subheader(f"Output")
+             st_anonymize_results = anonymize(
+                 text=st_text,
+                 operator=st_operator,
+                 mask_char=st_mask_char,
+                 number_of_chars=st_number_of_chars,
+                 encrypt_key=st_encrypt_key,
+                 analyze_results=st_analyze_results,
+             )
+             st.text_area(
+                 label="De-identified", value=st_anonymize_results.text, height=400
+             )
+     # elif st_operator == "synthesize":
+     #     with col2:
+     #         st.subheader(f"OpenAI Generated output")
+     #         fake_data = create_fake_data(
+     #             st_text,
+     #             st_analyze_results,
+     #             open_ai_params,
+     #         )
+     #         st.text_area(label="Synthetic data", value=fake_data, height=400)
+     else:
+         st.subheader("Highlighted")
+         annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
+         # annotated_tokens
+         annotated_text(*annotated_tokens)
+
+     # table result
+     st.subheader(
+         "Findings"
+         if not st_return_decision_process
+         else "Findings with decision factors"
+     )
+     if st_analyze_results:
+         df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+         df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+
+         df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
+             {
+                 "entity_type": "Entity type",
+                 "text": "Text",
+                 "start": "Start",
+                 "end": "End",
+                 "score": "Confidence",
+             },
+             axis=1,
+         )
+         df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+         if st_return_decision_process:
+             analysis_explanation_df = pd.DataFrame.from_records(
+                 [r.analysis_explanation.to_dict() for r in st_analyze_results]
+             )
+             df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
+         st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
+     else:
+         st.text("No findings")
+
+ except Exception as e:
+     print(e)
+     traceback.print_exc()
+     st.error(e)
+
+ components.html(
+     """
+     <script type="text/javascript">
+     (function(c,l,a,r,i,t,y){
+         c[a]=c[a]||function(){(c[a].q=c[a].q||[]).push(arguments)};
+         t=l.createElement(r);t.async=1;t.src="https://www.clarity.ms/tag/"+i;
+         y=l.getElementsByTagName(r)[0];y.parentNode.insertBefore(t,y);
+     })(window, document, "clarity", "script", "h7f8bp42n8");
+     </script>
+     """
+ )
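
Editorial note (not part of the committed files): annotated_text() consumes the output of annotate() from presidio_helpers, a mix of plain strings for unchanged text and (text, entity_type) tuples for detected spans. A hypothetical value, for illustration only:

    # Editorial sketch: launch locally with "streamlit run presidio_streamlit.py".
    # The tuples below are illustrative, not actual app output.
    annotated_tokens = [
        "Hello, my name is ",
        ("David Johnson", "PERSON"),
        " and I live in ",
        ("Maine", "LOCATION"),
        ".",
    ]
    # annotated_text(*annotated_tokens)  # rendered in the "Highlighted" panel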
recognizers.yaml ADDED
@@ -0,0 +1,100 @@
+ recognizers:
+   -
+     name: "FRENCH_NID"
+     supported_language: "fr"
+     patterns:
+       -
+         name: "FRENCH_NID"
+         regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
+         score: 0.5
+     context:
+       - national
+     supported_entity: "FRENCH_NID"
+   -
+     name: "FRENCH_NID"
+     supported_language: "en"
+     patterns:
+       -
+         name: "FRENCH_NID"
+         regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
+         score: 0.5
+     context:
+       - national
+     supported_entity: "FRENCH_NID"
+   -
+     name: "FRENCH_PASS"
+     supported_language: "fr"
+     patterns:
+       -
+         name: "FRENCH_PASS"
+         regex: "[0-9]{2}([a-z]|[A-Z]){2}[0-9]{5}"
+         score: 0.5
+     context:
+       - passeport
+     supported_entity: "FRENCH_PASS"
+   -
+     name: "FRENCH_PASS"
+     supported_language: "en"
+     patterns:
+       -
+         name: "FRENCH_PASS"
+         regex: "[0-9]{2}([a-z]|[A-Z]){2}[0-9]{5}"
+         score: 0.5
+     context:
+       - passport
+     supported_entity: "FRENCH_PASS"
+   -
+     name: "FRENCH_SSN"
+     supported_language: "fr"
+     patterns:
+       -
+         name: "FRENCH_SSN"
+         regex: "[0-9]{15}"
+         score: 0.5
+     context:
+       - sécurité sociale
+       - social
+     supported_entity: "FRENCH_SSN"
+   -
+     name: "FRENCH_SSN"
+     supported_language: "en"
+     patterns:
+       -
+         name: "FRENCH_SSN"
+         regex: "[0-9]{15}"
+         score: 0.5
+     context:
+       - social security
+       - social
+     supported_entity: "FRENCH_SSN"
+   # -
+   #   name: "CREDIT_CARD"
+   #   supported_language: "fr"
+   #   context:
+   #     - crédit
+   #     - carte
+   #     - carte de crédit
+   #   supported_entity: "CREDIT_CARD"
+   #   deny_list:
+   #     - carte
+   # -
+   #   name: "DATE_TIME"
+   #   supported_language: "fr"
+   #   context:
+   #     - mois
+   #     - date
+   #     - jour
+   #     - année
+   #   supported_entity: "DATE_TIME"
+   #   deny_list:
+   #     - mois
+   # -
+   #   name: "PHONE_NUMBER"
+   #   supported_language: "fr"
+   #   context:
+   #     - téléphone
+   #   supported_entity: "PHONE_NUMBER"
+   #   deny_list:
+   #     - téléphone
+
+
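
Editorial sketch (not part of the committed files): a quick sanity check of the custom FRENCH_PASS, FRENCH_SSN and FRENCH_NID patterns above against the values used in the demo texts; note the regexes are unanchored, so they match anywhere inside the text.

    # Editorial sketch: the custom patterns from recognizers.yaml against demo values.
    import re

    assert re.search(r"[0-9]{2}([a-z]|[A-Z]){2}[0-9]{5}", "Mon passeport : 59RF05400")  # FRENCH_PASS
    assert re.search(r"[0-9]{15}", "social security number is 269054958815780")  # FRENCH_SSN
    assert re.search(r"[0-9]{12}|([A-Z]|[0-9]){9}", "national id is 345623456789 or maybe X4RTBPFW4")  # FRENCH_NID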
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ presidio-analyzer
+ presidio-anonymizer
+ streamlit
+ streamlit-tags
+ pandas
+ python-dotenv
+ st-annotated-text
+ torch
+ transformers
+ flair
+ openai
+ spacy
+ azure-ai-textanalytics
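
Editorial sketch (not part of the committed files): after installing the requirements above with pip, the two spaCy models used by the app can be pre-fetched so the first request does not block on a download; this mirrors the is_package check in presidio_nlp_engine_config.create_nlp_engine_with_spacy.

    # Editorial sketch: pre-download the spaCy models referenced by the app.
    import spacy

    for model_name in ("en_core_web_lg", "fr_core_news_md"):
        if not spacy.util.is_package(model_name):
            spacy.cli.download(model_name)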