Spaces:

AliaeAI
/

Anonymizer_demo

Sleeping

App Files Files Community

Farnazgh committed on Jul 24, 2023

Commit

b494f67

•

1 Parent(s): a825aee

add files

Browse files

Files changed (17) hide show

.idea/.gitignore +3 -0
.idea/Aliae_anonymizer.iml +10 -0
.idea/inspectionProfiles/Project_Default.xml +12 -0
.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/misc.xml +4 -0
.idea/modules.xml +8 -0
.idea/vcs.xml +6 -0
__pycache__/presidio_helpers.cpython-310.pyc +0 -0
__pycache__/presidio_nlp_engine_config.cpython-310.pyc +0 -0
en_demo_text.txt +14 -0
fr_demo_text.txt +14 -0
logo.png +0 -0
presidio_helpers.py +261 -0
presidio_nlp_engine_config.py +141 -0
presidio_streamlit.py +352 -0
recognizers.yaml +100 -0
requirements.txt +13 -0

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Default ignored files
+/shelf/
+/workspace.xml

.idea/Aliae_anonymizer.iml ADDED Viewed

	@@ -0,0 +1,10 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

.idea/inspectionProfiles/Project_Default.xml ADDED Viewed

	@@ -0,0 +1,12 @@

+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredIdentifiers">
+        <list>
+          <option value="graphbot.graphize.GraphBot.graphize" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,4 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
+</project>

.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Aliae_anonymizer.iml" filepath="$PROJECT_DIR$/.idea/Aliae_anonymizer.iml" />
+    </modules>
+  </component>
+</project>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>

__pycache__/presidio_helpers.cpython-310.pyc ADDED Viewed

Binary file (6.11 kB). View file

__pycache__/presidio_nlp_engine_config.cpython-310.pyc ADDED Viewed

Binary file (1.13 kB). View file

en_demo_text.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+Hello, my name is David Johnson and I live in Maine.
+My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+On September 18 I visited microsoft.com and sent an email to [email protected],  from the IP 192.168.0.1.
+My passport: 59RF05400 and my phone number: +330788848206.
+This is a valid International Bank Account Number: FR76 3000 6000 0112 3456 7890 189 or FR7630006000011234567890189 .
+Kate's social security number is 269054958815780.
+Pierre's nationality is French. He was born on 01/02/1990.
+His national id is 345623456789 or maybe X4RTBPFW4.

fr_demo_text.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+Bonjour, je m'appelle David Johnson et j'habite dans le Maine.
+Mon numéro de carte de crédit est 4095-2609-9393-4932 et mon identifiant de portefeuille crypto est 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+Le 18 septembre, j'ai visité microsoft.com et envoyé un e-mail à [email protected], à partir de l'IP 192.168.0.1.
+Mon passeport : 59RF05400 et mon numéro de téléphone : +330788848206.
+Il s'agit d'un numéro de compte bancaire international valide : FR76 3000 6000 0112 3456 7890 189 ou FR7630006000011234567890189.
+Le numéro de sécurité sociale de Kate est le 269054958815780.
+La nationalité de Pierre est française. Il est né le 01/02/1990.
+Son identifiant national est 345623456789 ou peut-être X4RTBPFW4.

logo.png ADDED Viewed

presidio_helpers.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""
+Helper methods for the Presidio Streamlit app
+"""
+from typing import List, Optional, Tuple
+import logging
+import streamlit as st
+from presidio_analyzer import (
+    AnalyzerEngine,
+    RecognizerResult,
+    RecognizerRegistry,
+    PatternRecognizer,
+    Pattern,
+)
+from presidio_analyzer.nlp_engine import NlpEngine
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import OperatorConfig
+# from openai_fake_data_generator import (
+#     set_openai_params,
+#     call_completion_model,
+#     create_prompt,
+#     OpenAIParams,
+# )
+from presidio_nlp_engine_config import (
+    create_nlp_engine_with_spacy,
+    # create_nlp_engine_with_flair,
+    # create_nlp_engine_with_transformers,
+    # create_nlp_engine_with_azure_text_analytics,
+)
+logger = logging.getLogger("presidio-streamlit")
+@st.cache_resource
+def nlp_engine_and_registry(
+    model_family: str,
+    model_path: str,
+    ta_key: Optional[str] = None,
+    ta_endpoint: Optional[str] = None,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """Create the NLP Engine instance based on the requested model.
+    :param model_family: Which model package to use for NER.
+    :param model_path: Which model to use for NER. E.g.,
+        "StanfordAIMI/stanford-deidentifier-base",
+        "obi/deid_roberta_i2b2",
+        "en_core_web_lg"
+    :param ta_key: Key to the Text Analytics endpoint (only if model_path = "Azure Text Analytics")
+    :param ta_endpoint: Endpoint of the Text Analytics instance (only if model_path = "Azure Text Analytics")
+    """
+    # Set up NLP Engine according to the model of choice
+    if "spaCy" in model_family:
+        return create_nlp_engine_with_spacy(model_path)
+    # elif "flair" in model_family:
+    #     return create_nlp_engine_with_flair(model_path)
+    elif "HuggingFace" in model_family:
+        return create_nlp_engine_with_transformers(model_path)
+    # elif "Azure Text Analytics" in model_family:
+    #     return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
+    # else:
+    #     raise ValueError(f"Model family {model_family} not supported")
+@st.cache_resource
+def analyzer_engine(
+    model_family: str,
+    model_path: str,
+    ta_key: Optional[str] = None,
+    ta_endpoint: Optional[str] = None,
+) -> AnalyzerEngine:
+    """Create the NLP Engine instance based on the requested model.
+    :param model_family: Which model package to use for NER.
+    :param model_path: Which model to use for NER:
+        "StanfordAIMI/stanford-deidentifier-base",
+        "obi/deid_roberta_i2b2",
+        "en_core_web_lg"
+    :param ta_key: Key to the Text Analytics endpoint (only if model_path = "Azure Text Analytics")
+    :param ta_endpoint: Endpoint of the Text Analytics instance (only if model_path = "Azure Text Analytics")
+    """
+    nlp_engine, registry = nlp_engine_and_registry(
+        model_family, model_path, ta_key, ta_endpoint
+    )
+    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry, supported_languages=['fr', 'en'])
+    return analyzer
+@st.cache_resource
+def anonymizer_engine() -> AnonymizerEngine:
+    """Return an AnonymizerEngine built with default arguments.
+
+    Wrapped in ``st.cache_resource`` so the engine object can be reused
+    instead of being constructed on every call.
+    """
+    return AnonymizerEngine()
+@st.cache_data
+def get_supported_entities(
+    model_family: str, model_path: str, ta_key: str, ta_endpoint: str
+):
+    """Return supported entities from the Analyzer Engine."""
+    # return analyzer_engine(
+    #     model_family, model_path, ta_key, ta_endpoint
+    # ).get_supported_entities() + ["GENERIC_PII"]
+    return  ["PERSON", "IBAN_CODE", "PHONE_NUMBER", "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS",  "IP_ADDRESS", "NRP", "LOCATION", "URL", "FRENCH_SSN", "FRENCH_PASS", "FRENCH_NID"]
+@st.cache_data
+def analyze(
+    model_family: str, model_path: str, ta_key: str, ta_endpoint: str, **kwargs
+):
+    """Analyze input using Analyzer engine and input arguments (kwargs)."""
+    if "entities" not in kwargs or "All" in kwargs["entities"]:
+        kwargs["entities"] = None
+    if "deny_list" in kwargs and kwargs["deny_list"] is not None:
+        ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(kwargs["deny_list"])
+        kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
+        del kwargs["deny_list"]
+    if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0:
+        ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"])
+        kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
+        del kwargs["regex_params"]
+    return analyzer_engine(model_family, model_path, ta_key, ta_endpoint).analyze(
+        **kwargs
+    )
+def anonymize(
+    text: str,
+    operator: str,
+    analyze_results: List[RecognizerResult],
+    mask_char: Optional[str] = None,
+    number_of_chars: Optional[str] = None,
+    encrypt_key: Optional[str] = None,
+):
+    """Anonymize identified input using Presidio Anonymizer.
+    :param text: Full text
+    :param operator: Operator name
+    :param mask_char: Mask char (for mask operator)
+    :param number_of_chars: Number of characters to mask (for mask operator)
+    :param encrypt_key: Encryption key (for encrypt operator)
+    :param analyze_results: list of results from presidio analyzer engine
+    """
+    if operator == "mask":
+        operator_config = {
+            "type": "mask",
+            "masking_char": mask_char,
+            "chars_to_mask": number_of_chars,
+            "from_end": False,
+        }
+    # Define operator config
+    elif operator == "encrypt":
+        operator_config = {"key": encrypt_key}
+    elif operator == "highlight":
+        operator_config = {"lambda": lambda x: x}
+    else:
+        operator_config = None
+    # Change operator if needed as intermediate step
+    if operator == "highlight":
+        operator = "custom"
+    elif operator == "synthesize":
+        operator = "replace"
+    else:
+        operator = operator
+    res = anonymizer_engine().anonymize(
+        text,
+        analyze_results,
+        operators={"DEFAULT": OperatorConfig(operator, operator_config)},
+    )
+    return res
+def annotate(text: str, analyze_results: List[RecognizerResult]):
+    """Highlight the identified PII entities on the original text
+    :param text: Full text
+    :param analyze_results: list of results from presidio analyzer engine
+    """
+    tokens = []
+    # Use the anonymizer to resolve overlaps
+    results = anonymize(
+        text=text,
+        operator="highlight",
+        analyze_results=analyze_results,
+    )
+    # sort by start index
+    results = sorted(results.items, key=lambda x: x.start)
+    for i, res in enumerate(results):
+        if i == 0:
+            tokens.append(text[: res.start])
+        # append entity text and entity type
+        tokens.append((text[res.start : res.end], res.entity_type))
+        # if another entity coming i.e. we're not at the last results element, add text up to next entity
+        if i != len(results) - 1:
+            tokens.append(text[res.end : results[i + 1].start])
+        # if no more entities coming, add all remaining text
+        else:
+            tokens.append(text[res.end :])
+    return tokens
+# def create_fake_data(
+#     text: str,
+#     analyze_results: List[RecognizerResult],
+#     openai_params: OpenAIParams,
+# ):
+#     """Creates a synthetic version of the text using OpenAI APIs"""
+#     if not openai_params.openai_key:
+#         return "Please provide your OpenAI key"
+#     results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
+#     set_openai_params(openai_params)
+#     prompt = create_prompt(results.text)
+#     print(f"Prompt: {prompt}")
+#     fake = call_openai_api(
+#         prompt=prompt,
+#         openai_model_name=openai_params.model,
+#         openai_deployment_name=openai_params.deployment_name,
+#     )
+#     return fake
+# @st.cache_data
+# def call_openai_api(
+#     prompt: str, openai_model_name: str, openai_deployment_name: Optional[str] = None
+# ) -> str:
+#     fake_data = call_completion_model(
+#         prompt, model=openai_model_name, deployment_id=openai_deployment_name
+#     )
+#     return fake_data
+def create_ad_hoc_deny_list_recognizer(
+    deny_list=Optional[List[str]],
+) -> Optional[PatternRecognizer]:
+    if not deny_list:
+        return None
+    deny_list_recognizer = PatternRecognizer(
+        supported_entity="GENERIC_PII", deny_list=deny_list
+    )
+    return deny_list_recognizer
+def create_ad_hoc_regex_recognizer(
+    regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
+) -> Optional[PatternRecognizer]:
+    if not regex:
+        return None
+    pattern = Pattern(name="Regex pattern", regex=regex, score=score)
+    regex_recognizer = PatternRecognizer(
+        supported_entity=entity_type, patterns=[pattern], context=context
+    )
+    return regex_recognizer

presidio_nlp_engine_config.py ADDED Viewed

	@@ -0,0 +1,141 @@

+from typing import Tuple
+import logging
+import spacy
+from presidio_analyzer import RecognizerRegistry
+from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
+logger = logging.getLogger("presidio-streamlit")
+def create_nlp_engine_with_spacy(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a spaCy model
+    :param model_path: spaCy model path.
+    """
+    if not spacy.util.is_package(model_path):
+        spacy.cli.download(model_path)
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": model_path.split('_')[0], "model_name": model_path}],
+    }
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry = RecognizerRegistry()
+    # registry.load_predefined_recognizers()
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=["fr", "en"])
+    registry.add_recognizers_from_yaml("recognizers.yaml")
+    return nlp_engine, registry
+# def create_nlp_engine_with_transformers(
+#     model_path: str,
+# ) -> Tuple[NlpEngine, RecognizerRegistry]:
+#     """
+#     Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
+#     The TransformersRecognizer would return results from Transformers models, the spaCy model
+#     would return NlpArtifacts such as POS and lemmas.
+#     :param model_path: HuggingFace model path.
+#     """
+#
+#     from transformers_rec import (
+#         STANFORD_COFIGURATION,
+#         BERT_DEID_CONFIGURATION,
+#         TransformersRecognizer,
+#     )
+#
+#     registry = RecognizerRegistry()
+#     registry.load_predefined_recognizers()
+#
+#     if not spacy.util.is_package("en_core_web_sm"):
+#         spacy.cli.download("en_core_web_sm")
+#     # Using a small spaCy model + a HF NER model
+#     transformers_recognizer = TransformersRecognizer(model_path=model_path)
+#
+#     if model_path == "StanfordAIMI/stanford-deidentifier-base":
+#         transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
+#     elif model_path == "obi/deid_roberta_i2b2":
+#         transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+#     else:
+#         print(f"Warning: Model has no configuration, loading default.")
+#         transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+#
+#     # Use small spaCy model, no need for both spacy and HF models
+#     # The transformers model is used here as a recognizer, not as an NlpEngine
+#     nlp_configuration = {
+#         "nlp_engine_name": "spacy",
+#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+#     }
+#
+#     registry.add_recognizer(transformers_recognizer)
+#     registry.remove_recognizer("SpacyRecognizer")
+#
+#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+#
+#     return nlp_engine, registry
+# def create_nlp_engine_with_flair(
+#     model_path: str,
+# ) -> Tuple[NlpEngine, RecognizerRegistry]:
+#     """
+#     Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
+#     The FlairRecognizer would return results from Flair models, the spaCy model
+#     would return NlpArtifacts such as POS and lemmas.
+#     :param model_path: Flair model path.
+#     """
+#     from flair_recognizer import FlairRecognizer
+#
+#     registry = RecognizerRegistry()
+#     registry.load_predefined_recognizers()
+#
+#     if not spacy.util.is_package("en_core_web_sm"):
+#         spacy.cli.download("en_core_web_sm")
+#     # Using a small spaCy model + a Flair NER model
+#     flair_recognizer = FlairRecognizer(model_path=model_path)
+#     nlp_configuration = {
+#         "nlp_engine_name": "spacy",
+#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+#     }
+#     registry.add_recognizer(flair_recognizer)
+#     registry.remove_recognizer("SpacyRecognizer")
+#
+#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+#
+#     return nlp_engine, registry
+# def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
+#     """
+#     Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
+#     The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
+#     would return NlpArtifacts such as POS and lemmas.
+#     :param ta_key: Azure Text Analytics key.
+#     :param ta_endpoint: Azure Text Analytics endpoint.
+#     """
+#     from text_analytics_wrapper import TextAnalyticsWrapper
+#
+#     if not ta_key or not ta_endpoint:
+#         raise RuntimeError("Please fill in the Text Analytics endpoint details")
+#
+#     registry = RecognizerRegistry()
+#     registry.load_predefined_recognizers()
+#
+#     ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
+#     nlp_configuration = {
+#         "nlp_engine_name": "spacy",
+#         "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+#     }
+#
+#     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+#
+#     registry.add_recognizer(ta_recognizer)
+#     registry.remove_recognizer("SpacyRecognizer")
+#
+#     return nlp_engine, registry

presidio_streamlit.py ADDED Viewed

	@@ -0,0 +1,352 @@

+"""Streamlit app for Presidio."""
+import logging
+import os
+import traceback
+import dotenv
+import pandas as pd
+import streamlit as st
+import streamlit.components.v1 as components
+from annotated_text import annotated_text
+from streamlit_tags import st_tags
+# from openai_fake_data_generator import OpenAIParams
+from presidio_helpers import (
+    get_supported_entities,
+    analyze,
+    anonymize,
+    annotate,
+    # create_fake_data,
+    analyzer_engine,
+)
+st.set_page_config(
+    page_title="Presidio demo",
+    layout="wide",
+    initial_sidebar_state="expanded",
+    # menu_items={
+    #     "About": "https://microsoft.github.io/presidio/",
+    # },
+)
+dotenv.load_dotenv()
+logger = logging.getLogger("presidio-streamlit")
+allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False)
+# Sidebar
+st.sidebar.header(
+    """
+Personal Info Anonymization
+"""
+)
+# set aliae logo
+st.sidebar.image('logo.png', use_column_width=True)
+model_help_text = """
+    Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
+    Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair,
+    as well as service such as Azure Text Analytics PII.
+    """
+st_ta_key = st_ta_endpoint = ""
+model_list = [
+    "spaCy/en_core_web_lg",
+    "spaCy/fr_core_news_md",
+]
+# "flair/ner-english-large",
+#
+# "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
+# "Azure Text Analytics PII",
+# "Other",
+# if not allow_other_models:
+#     model_list.pop()
+# Select model
+lang = st.sidebar.selectbox(
+    "Language",
+    ['en','fr'],
+    index=0,
+)
+# Extract model package.
+# st_model_package = st_model.split("/")[0]
+st_model_package = 'spaCy'
+# # Remove package prefix (if needed)
+# st_model = (
+#     st_model
+#     if st_model_package not in ("spaCy", "HuggingFace")
+#     else "/".join(st_model.split("/")[1:])
+# )
+st_model = 'en_core_web_lg'
+if lang =='en': st_model = 'en_core_web_lg'
+elif lang == 'fr' : st_model = 'fr_core_news_md'
+# if st_model == "Other":
+#     st_model_package = st.sidebar.selectbox(
+#         "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
+#     )
+#     st_model = st.sidebar.text_input(f"NER model name", value="")
+# if st_model == "Azure Text Analytics PII":
+#     st_ta_key = st.sidebar.text_input(
+#         f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
+#     )
+#     st_ta_endpoint = st.sidebar.text_input(
+#         f"Text Analytics endpoint",
+#         value=os.getenv("TA_ENDPOINT", default=""),
+#         help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
+#     )
+# st.sidebar.warning("Note: Models might take some time to download. ")
+analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
+logger.debug(f"analyzer_params: {analyzer_params}")
+st_operator = st.sidebar.selectbox(
+    "De-identification approach",
+    ["redact", "replace", "highlight"],
+    index=2,
+    help="""
+    Select which manipulation to the text is requested after PII has been identified.\n
+    - Redact: Completely remove the PII text\n
+    - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
+    - Highlight: Shows the original text with PII highlighted in colors\n
+         """,
+)
+st_mask_char = "*"
+st_number_of_chars = 15
+st_encrypt_key = "WmZq4t7w!z%C&F)J"
+open_ai_params = None
+logger.debug(f"st_operator: {st_operator}")
+# if st_operator == "mask":
+#     st_number_of_chars = st.sidebar.number_input(
+#         "number of chars", value=st_number_of_chars, min_value=0, max_value=100
+#     )
+#     st_mask_char = st.sidebar.text_input(
+#         "Mask character", value=st_mask_char, max_chars=1
+#     )
+# elif st_operator == "encrypt":
+#     st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
+# elif st_operator == "synthesize":
+#     if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
+#         openai_api_type = "azure"
+#         st_openai_api_base = st.sidebar.text_input(
+#             "Azure OpenAI base URL",
+#             value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
+#         )
+#         st_deployment_name = st.sidebar.text_input(
+#             "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
+#         )
+#         st_openai_version = st.sidebar.text_input(
+#             "OpenAI version",
+#             value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
+#         )
+#     else:
+#         st_openai_version = openai_api_type = st_openai_api_base = None
+#         st_deployment_name = ""
+#     st_openai_key = st.sidebar.text_input(
+#         "OPENAI_KEY",
+#         value=os.getenv("OPENAI_KEY", default=""),
+#         help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
+#         type="password",
+#     )
+#     st_openai_model = st.sidebar.text_input(
+#         "OpenAI model for text synthesis",
+#         value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
+#         help="See more here: https://platform.openai.com/docs/models/",
+#     )
+#
+#     open_ai_params = OpenAIParams(
+#         openai_key=st_openai_key,
+#         model=st_openai_model,
+#         api_base=st_openai_api_base,
+#         deployment_name=st_deployment_name,
+#         api_version=st_openai_version,
+#         api_type=openai_api_type,
+#     )
+# st_threshold = st.sidebar.slider(
+#     label="Acceptance threshold",
+#     min_value=0.0,
+#     max_value=1.0,
+#     value=0.35,
+#     help="Define the threshold for accepting a detection as PII. See more here: ",
+# )
+st_threshold = 0.35
+#
+# st_return_decision_process = st.sidebar.checkbox(
+#     "Add analysis explanations to findings",
+#     value=False,
+#     help="Add the decision process to the output table. "
+#     "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
+# )
+st_return_decision_process = False
+# # Allow and deny lists
+# st_deny_allow_expander = st.sidebar.expander(
+#     "Allowlists and denylists",
+#     expanded=False,
+# )
+#
+# with st_deny_allow_expander:
+#     st_allow_list = st_tags(
+#         label="Add words to the allowlist", text="Enter word and press enter."
+#     )
+#     st.caption(
+#         "Allowlists contain words that are not considered PII, but are detected as such."
+#     )
+#
+#     st_deny_list = st_tags(
+#         label="Add words to the denylist", text="Enter word and press enter."
+#     )
+#     st.caption(
+#         "Denylists contain words that are considered PII, but are not detected as such."
+#     )
+st_allow_list = []
+st_deny_list = []
+# Main panel
+with st.expander("About Microsoft Presidio", expanded=False):
+    st.info(
+        """Presidio is an open source customizable framework for PII detection and de-identification."""
+    )
+analyzer_load_state = st.info("Starting Presidio analyzer...")
+analyzer_load_state.empty()
+# Read default text
+with open("en_demo_text.txt") as f:
+    en_demo_text = f.readlines()
+with open("fr_demo_text.txt") as f:
+    fr_demo_text = f.readlines()
+if lang == 'en': demo_text = en_demo_text
+elif lang == 'fr': demo_text = fr_demo_text
+# Create two columns for before and after
+col1, col2 = st.columns(2)
+# Before:
+col1.subheader("Input")
+st_text = col1.text_area(
+    label="Enter text", value="".join(demo_text), height=400, key="text_input"
+)
+try:
+    # Choose entities
+    st_entities_expander = st.sidebar.expander("Choose entities to look for")
+    st_entities = st_entities_expander.multiselect(
+        label="Which entities to look for?",
+        options=get_supported_entities(*analyzer_params),
+        default=list(get_supported_entities(*analyzer_params)),
+        help="Limit the list of PII entities detected. "
+        "This list is dynamic and based on the NER model and registered recognizers. "
+        "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
+    )
+    # Before
+    analyzer_load_state = st.info("Starting Presidio analyzer...")
+    analyzer = analyzer_engine(*analyzer_params)
+    analyzer_load_state.empty()
+    st_analyze_results = analyze(
+        *analyzer_params,
+        text=st_text,
+        entities=st_entities,
+        language=lang,
+        score_threshold=st_threshold,
+        return_decision_process=st_return_decision_process,
+        allow_list=st_allow_list,
+        deny_list=st_deny_list,
+    )
+    # After
+    if st_operator not in ("highlight", "synthesize"):
+        with col2:
+            st.subheader(f"Output")
+            st_anonymize_results = anonymize(
+                text=st_text,
+                operator=st_operator,
+                mask_char=st_mask_char,
+                number_of_chars=st_number_of_chars,
+                encrypt_key=st_encrypt_key,
+                analyze_results=st_analyze_results,
+            )
+            st.text_area(
+                label="De-identified", value=st_anonymize_results.text, height=400
+            )
+    # elif st_operator == "synthesize":
+    #     with col2:
+    #         st.subheader(f"OpenAI Generated output")
+    #         fake_data = create_fake_data(
+    #             st_text,
+    #             st_analyze_results,
+    #             open_ai_params,
+    #         )
+    #         st.text_area(label="Synthetic data", value=fake_data, height=400)
+    else:
+        st.subheader("Highlighted")
+        annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
+        # annotated_tokens
+        annotated_text(*annotated_tokens)
+    # table result
+    st.subheader(
+        "Findings"
+        if not st_return_decision_process
+        else "Findings with decision factors"
+    )
+    if st_analyze_results:
+        df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+        df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+        df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
+            {
+                "entity_type": "Entity type",
+                "text": "Text",
+                "start": "Start",
+                "end": "End",
+                "score": "Confidence",
+            },
+            axis=1,
+        )
+        df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+        if st_return_decision_process:
+            analysis_explanation_df = pd.DataFrame.from_records(
+                [r.analysis_explanation.to_dict() for r in st_analyze_results]
+            )
+            df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
+        st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
+    else:
+        st.text("No findings")
+except Exception as e:
+    print(e)
+    traceback.print_exc()
+    st.error(e)
+components.html(
+    """
+    <script type="text/javascript">
+    (function(c,l,a,r,i,t,y){
+        c[a]=c[a]||function(){(c[a].q=c[a].q||[]).push(arguments)};
+        t=l.createElement(r);t.async=1;t.src="https://www.clarity.ms/tag/"+i;
+        y=l.getElementsByTagName(r)[0];y.parentNode.insertBefore(t,y);
+    })(window, document, "clarity", "script", "h7f8bp42n8");
+    </script>
+    """
+)

recognizers.yaml ADDED Viewed

	@@ -0,0 +1,100 @@

+recognizers:
+  -
+    name: "FRENCH_NID"
+    supported_language: "fr"
+    patterns:
+      -
+         name: "FRENCH_NID"
+         regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
+         score: 0.5
+    context:
+      - national
+    supported_entity: "FRENCH_NID"
+  -
+    name: "FRENCH_NID"
+    supported_language: "en"
+    patterns:
+      -
+         name: "FRENCH_NID"
+         regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
+         score: 0.5
+    context:
+     - national
+    supported_entity: "FRENCH_NID"
+  -
+    name: "FRENCH_PASS"
+    supported_language: "fr"
+    patterns:
+      -
+         name: "FRENCH_PASS"
+         regex: "[0-9]{2}([a-z]|[A-Z]){2}[0-9]{5}"
+         score: 0.5
+    context:
+     - passeport
+    supported_entity: "FRENCH_PASS"
+  -
+    name: "FRENCH_PASS"
+    supported_language: "en"
+    patterns:
+      -
+         name: "FRENCH_PASS"
+         regex: "[0-9]{2}([a-z]|[A-Z]){2}[0-9]{5}"
+         score: 0.5
+    context:
+     - passport
+    supported_entity: "FRENCH_PASS"
+  -
+    name: "FRENCH_SSN"
+    supported_language: "fr"
+    patterns:
+      -
+         name: "FRENCH_SSN"
+         regex: "[0-9]{15}"
+         score: 0.5
+    context:
+     - sécurité sociale
+     - social
+    supported_entity: "FRENCH_SSN"
+  -
+    name: "FRENCH_SSN"
+    supported_language: "en"
+    patterns:
+      -
+         name: "FRENCH_SSN"
+         regex: "[0-9]{15}"
+         score: 0.5
+    context:
+     - social security
+     - social
+    supported_entity: "FRENCH_SSN"
+#  -
+#    name: "CREDIT_CARD"
+#    supported_language: "fr"
+#    context:
+#     - crédit
+#     - carte
+#     - carte de crédit
+#    supported_entity: "CREDIT_CARD"
+#    deny_list:
+#      - carte
+#  -
+#    name: "DATE_TIME"
+#    supported_language: "fr"
+#    context:
+#     - mois
+#     - date
+#     - jour
+#     - année
+#    supported_entity: "DATE_TIME"
+#    deny_list:
+#      - mois
+#  -
+#    name: "PHONE_NUMBER"
+#    supported_language: "fr"
+#    context:
+#     - téléphone
+#    supported_entity: "PHONE_NUMBER"
+#    deny_list:
+#      - téléphone

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+presidio-analyzer
+presidio-anonymizer
+streamlit
+streamlit-tags
+pandas
+python-dotenv
+st-annotated-text
+torch
+transformers
+flair
+openai
+spacy
+azure-ai-textanalytics