@staticmethod
def get_all_nlp_engines(cls=None):
    """Return the set of all direct and indirect subclasses of ``cls``.

    :param cls: Class whose subclass tree is collected. When omitted
        (or falsy), ``NlpEngine`` is used as the root.
    :return: A ``set`` holding every class found below the root; the root
        itself is not included.
    """
    root = cls if cls else NlpEngine

    discovered = set()
    # Worklist traversal of the subclass tree instead of recursion.
    pending = list(root.__subclasses__())
    while pending:
        candidate = pending.pop()
        if candidate in discovered:
            # Reached again through another base (multiple inheritance).
            continue
        discovered.add(candidate)
        pending.extend(candidate.__subclasses__())

    return discovered
class SpacyLLMNlpEngine(SpacyNlpEngine):
    """NLP engine that assembles one spacy-llm pipeline per configured language.

    The pipeline is built from a spaCy config file, with the language,
    the NER labels, the few-shot examples path and the LLM model settings
    injected as config overrides (see ``_get_overrides``).

    ``load`` requires the ``OPENAI_API_KEY`` environment variable to be set.
    """

    engine_name = "spacy-llm"
    # False when the optional ``spacy_llm`` package could not be imported.
    is_available = bool(spacy_llm)

    def __init__(
        self,
        models: Optional[List[Dict[str, str]]] = None,
        ner_model_configuration: Optional[NerModelConfiguration] = None,
        path_to_config: Optional[str] = "spacy_llm.cfg",
        path_to_examples: Optional[str] = "spacy_llm_examples.json",
    ):
        """Store the model list and locate the config and examples files.

        :param models: Model definitions, each holding at least ``lang_code``.
        :param ner_model_configuration: Entity mapping configuration; its
            ``model_to_presidio_entity_mapping`` keys become the NER labels.
        :param path_to_config: Path to the spaCy config file. A relative path
            that does not exist in the working directory is looked up next
            to this module, where the default file is shipped.
        :param path_to_examples: Path to the few-shot examples file, resolved
            the same way as ``path_to_config``. ``None`` is kept as ``None``.
        """
        super().__init__(models, ner_model_configuration)
        self.path_to_config = self._resolve_path(path_to_config)
        self.path_to_examples = (
            None
            if path_to_examples is None
            else str(self._resolve_path(path_to_examples))
        )

    @staticmethod
    def _resolve_path(path: str) -> Path:
        """Return ``path`` as a ``Path``, falling back to this module's folder.

        Absolute paths and paths that exist relative to the current working
        directory are returned unchanged. Otherwise the same relative path is
        tried next to this module; if that does not exist either, the
        original path is returned so a later error names what the caller
        passed.
        """
        candidate = Path(path)
        if candidate.is_absolute() or candidate.exists():
            return candidate
        bundled = Path(__file__).parent / candidate
        return bundled if bundled.exists() else candidate

    def load(self) -> None:
        """Assemble the spacy-llm pipeline for every configured model.

        Populates ``self.nlp`` with one pipeline per ``lang_code``.

        :raises ValueError: If ``OPENAI_API_KEY`` is unset or empty.
        """
        # An empty key is as unusable as a missing one, so reject both early.
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError(
                "Could not find the API key to access the OpenAI API. "
                "Ensure you have an API key set up via "
                "https://platform.openai.com/account/api-keys, "
                "then make it available as "
                "an environment variable 'OPENAI_API_KEY'."
            )

        labels = list(
            self.ner_model_configuration.model_to_presidio_entity_mapping
        )

        self.nlp = {}
        for model in self.models:
            self._validate_model_params(model)
            language = model["lang_code"]
            # NOTE(review): model["model_name"] from the configuration is not
            # forwarded here, so the defaults of _get_overrides always apply.
            # Confirm whether it should map to `llm_models` or `model_name`.
            overrides = self._get_overrides(
                language=language,
                labels=labels,
                examples_path=self.path_to_examples,
            )
            config = load_config(
                self.path_to_config, overrides=overrides, interpolate=True
            )
            self.nlp[language] = assemble_from_config(config)

    @staticmethod
    def _get_overrides(
        language: str,
        labels: List[str],
        examples_path: Optional[str] = None,
        llm_models: str = "spacy.GPT-3-5.v2",
        model_name: str = "gpt-3.5-turbo",
        llm_tasks: str = "spacy.NER.v3",
    ) -> Dict[str, Any]:
        """Build the config overrides applied on top of the config file.

        :param language: Value for ``nlp.lang``.
        :param labels: Value for ``components.llm.task.labels``.
        :param examples_path: Value for ``components.llm.task.examples.path``.
        :param llm_models: Value for ``components.llm.model.@llm_models``.
        :param model_name: Value for ``components.llm.model.name``.
        :param llm_tasks: Value for ``components.llm.task.@llm_tasks``.
        :return: Mapping from dotted config keys to their override values.
        """
        return {
            "nlp.lang": language,
            "components.llm.task.labels": labels,
            "components.llm.task.examples.path": examples_path,
            "components.llm.model.@llm_models": llm_models,
            "components.llm.model.name": model_name,
            "components.llm.task.@llm_tasks": llm_tasks,
        }


if __name__ == "__main__":
    # Manual smoke run; expects to be started from this module's directory.
    from dotenv import load_dotenv

    load_dotenv()
    nlp_engine = NlpEngineProvider(conf_file=Path("../../conf/spacy_llm.yaml")).create_engine()  # noqa E501
    nlp_artifacts = nlp_engine.process_text(
        "My name is John Doe and I live in New York.", "en"
    )
    print(nlp_artifacts)