# Anonymizer_demo / presidio_nlp_engine_config.py
# Farnazgh's picture
# add files
# b494f67
# raw
# history blame
# 5.19 kB
from typing import Tuple
import logging
import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
# Module-level logger, looked up under the fixed name "presidio-streamlit"
# rather than this module's own name.
logger = logging.getLogger("presidio-streamlit")
def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy-backed NlpEngine and a matching RecognizerRegistry.

    If ``spacy.util.is_package`` does not report ``model_path`` as an installed
    package, the model is downloaded with ``spacy.cli.download`` first. The
    language code passed to the engine is the part of ``model_path`` before
    its first underscore.

    The registry is filled with the predefined recognizers for the languages
    ``"fr"`` and ``"en"`` and then extended with the recognizers declared in
    ``recognizers.yaml`` (a relative path).

    :param model_path: spaCy model name; its prefix up to the first ``_`` is
        used as the language code.
    :return: A ``(nlp_engine, registry)`` tuple.
    """
    # Fetch the model only when spaCy does not already see it as a package.
    already_installed = spacy.util.is_package(model_path)
    if not already_installed:
        spacy.cli.download(model_path)

    # Language code is taken from the model name prefix (text before first "_").
    language_code = model_path.partition("_")[0]
    model_entry = {"lang_code": language_code, "model_name": model_path}
    engine_settings = {
        "nlp_engine_name": "spacy",
        "models": [model_entry],
    }

    provider = NlpEngineProvider(nlp_configuration=engine_settings)
    engine = provider.create_engine()

    recognizer_registry = RecognizerRegistry()
    # Predefined recognizers are loaded for a fixed language list, then the
    # recognizers from the YAML file are added on top.
    recognizer_registry.load_predefined_recognizers(
        nlp_engine=engine,
        languages=["fr", "en"],
    )
    recognizer_registry.add_recognizers_from_yaml("recognizers.yaml")

    return engine, recognizer_registry
# def create_nlp_engine_with_transformers(
# model_path: str,
# ) -> Tuple[NlpEngine, RecognizerRegistry]:
# """
# Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
# The TransformersRecognizer would return results from Transformers models, the spaCy model
# would return NlpArtifacts such as POS and lemmas.
# :param model_path: HuggingFace model path.
# """
#
# from transformers_rec import (
# STANFORD_COFIGURATION,
# BERT_DEID_CONFIGURATION,
# TransformersRecognizer,
# )
#
# registry = RecognizerRegistry()
# registry.load_predefined_recognizers()
#
# if not spacy.util.is_package("en_core_web_sm"):
# spacy.cli.download("en_core_web_sm")
# # Using a small spaCy model + a HF NER model
# transformers_recognizer = TransformersRecognizer(model_path=model_path)
#
# if model_path == "StanfordAIMI/stanford-deidentifier-base":
# transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
# elif model_path == "obi/deid_roberta_i2b2":
# transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
# else:
# print(f"Warning: Model has no configuration, loading default.")
# transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
#
# # Use small spaCy model, no need for both spacy and HF models
# # The transformers model is used here as a recognizer, not as an NlpEngine
# nlp_configuration = {
# "nlp_engine_name": "spacy",
# "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
# }
#
# registry.add_recognizer(transformers_recognizer)
# registry.remove_recognizer("SpacyRecognizer")
#
# nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
#
# return nlp_engine, registry
# def create_nlp_engine_with_flair(
# model_path: str,
# ) -> Tuple[NlpEngine, RecognizerRegistry]:
# """
# Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
# The FlairRecognizer would return results from Flair models, the spaCy model
# would return NlpArtifacts such as POS and lemmas.
# :param model_path: Flair model path.
# """
# from flair_recognizer import FlairRecognizer
#
# registry = RecognizerRegistry()
# registry.load_predefined_recognizers()
#
# if not spacy.util.is_package("en_core_web_sm"):
# spacy.cli.download("en_core_web_sm")
# # Using a small spaCy model + a Flair NER model
# flair_recognizer = FlairRecognizer(model_path=model_path)
# nlp_configuration = {
# "nlp_engine_name": "spacy",
# "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
# }
# registry.add_recognizer(flair_recognizer)
# registry.remove_recognizer("SpacyRecognizer")
#
# nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
#
# return nlp_engine, registry
# def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
# """
# Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
# The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
# would return NlpArtifacts such as POS and lemmas.
# :param ta_key: Azure Text Analytics key.
# :param ta_endpoint: Azure Text Analytics endpoint.
# """
# from text_analytics_wrapper import TextAnalyticsWrapper
#
# if not ta_key or not ta_endpoint:
# raise RuntimeError("Please fill in the Text Analytics endpoint details")
#
# registry = RecognizerRegistry()
# registry.load_predefined_recognizers()
#
# ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
# nlp_configuration = {
# "nlp_engine_name": "spacy",
# "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
# }
#
# nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
#
# registry.add_recognizer(ta_recognizer)
# registry.remove_recognizer("SpacyRecognizer")
#
# return nlp_engine, registry