Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experimental spacy-llm NLP engine #1340

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions presidio-analyzer/conf/spacy_llm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# NLP engine configuration for the experimental spacy-llm engine.
# The engine name matches SpacyLLMNlpEngine.engine_name ("spacy-llm").
nlp_engine_name: spacy-llm
models:
-
lang_code: en
# NOTE(review): SpacyLLMNlpEngine.load() only reads lang_code from each model
# entry; the LLM is chosen by the defaults of _get_overrides
# (spacy.GPT-3-5.v2 / gpt-3.5-turbo). Confirm whether model_name takes effect.
model_name: spacy.GPT-4.v2

ner_model_configuration:
# Presumably labels dropped from the NER output -- verify against
# NerModelConfiguration.
labels_to_ignore:
- O
# Keys of this mapping are sent to the LLM NER task as its label set
# (see SpacyLLMNlpEngine.load); values are the Presidio entity names.
model_to_presidio_entity_mapping:
PER: PERSON
PERSON: PERSON
LOC: LOCATION
LOCATION: LOCATION
GPE: LOCATION
ORG: ORGANIZATION
ORGANIZATION: ORGANIZATION
NORP: NRP
AGE: AGE
ID: ID
EMAIL: EMAIL
PATIENT: PERSON
STAFF: PERSON
HOSP: ORGANIZATION
PATORG: ORGANIZATION
DATE: DATE_TIME
TIME: DATE_TIME
PHONE: PHONE_NUMBER
HCW: PERSON
HOSPITAL: ORGANIZATION
FACILITY: LOCATION
IP_ADDRESS: IP_ADDRESS
PHONE_NUMBER: PHONE_NUMBER
CREDIT_CARD: CREDIT_CARD
URL: URL
CRYPTO: CRYPTO
PII: GENERIC_PII
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __init__(
nlp_configuration: Optional[Dict] = None,
):
if not nlp_engines:
nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine)
nlp_engines = NlpEngineProvider.get_all_nlp_engines()

self.nlp_engines = {
engine.engine_name: engine for engine in nlp_engines if engine.is_available
Expand Down Expand Up @@ -136,3 +136,14 @@ def _get_full_conf_path(
) -> Path:
"""Return a Path to the default conf file."""
return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file)

@staticmethod
def get_all_nlp_engines(cls=None):
    """Collect every direct and indirect subclass of an NLP engine class.

    :param cls: Class whose descendants are collected. Falls back to
        ``NlpEngine`` when omitted.
    :return: A set holding all classes found below ``cls`` in the
        inheritance tree (``cls`` itself is not included).
    """
    root = cls if cls else NlpEngine

    children = root.__subclasses__()
    found = set(children)
    # Descend into each direct subclass; the set removes classes that are
    # reachable through more than one parent.
    for child in children:
        found.update(NlpEngineProvider.get_all_nlp_engines(child))
    return found
26 changes: 26 additions & 0 deletions presidio-analyzer/presidio_analyzer/nlp_engine/spacy_llm.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# spacy-llm pipeline definition used by SpacyLLMNlpEngine.
# Several values below (language, labels, examples path, model and task
# registry names) are overridden at load time by
# SpacyLLMNlpEngine._get_overrides.

[paths]
# Path to the few-shot examples file; null unless overridden.
examples = null

[nlp]
lang = "en"
pipeline = ["llm"]

[components]

[components.llm]
factory = "llm"

[components.llm.task]
@llm_tasks = "spacy.NER.v3"
# Filled in from the keys of model_to_presidio_entity_mapping.
labels = []
description = Entities are the personally identifiable information (PII)
    or private health information (PHI)
    such as names, locations, organizations, ID numbers,
    and other types of data that can be used to identify an individual.

[components.llm.task.examples]
@misc = "spacy.FewShotReader.v1"
# Must reference a variable that exists in [paths]; the previous
# ${paths.spacy_llm_examples} was never defined there.
path = "${paths.examples}"

[components.llm.model]
@llm_models = "spacy.GPT-3-5.v1"
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
[
{
"text": "They stole my credit card. Its number is 15505-212441-24",
"spans": [
{
"text": "15505-212441-24",
"is_entity": true,
"label": "CREDIT_CARD",
"reason": "A credit card number is mentioned"
},
{
"text": "stole",
"is_entity": false,
"label": "==NONE==",
"reason": "stole is a non-sensitive word, not PII"
}
]
},
{
"text": "My name is Risheek and I live in Grandumbrella",
"spans": [
{
"text": "Risheek",
"is_entity": true,
"label": "PERSON",
"reason": "a name is mentioned"
},
{
"text": "Grandumbrella",
"is_entity": true,
"label": "LOCATION",
"reason": "Although unknown, it is mentioned that Risheek lives there."
}
]
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import logging
import os
from pathlib import Path
from typing import Optional, List, Dict, Any

from spacy.util import load_config

from presidio_analyzer.nlp_engine import SpacyNlpEngine, NerModelConfiguration, \
    NlpEngineProvider

# spacy-llm is an optional extra. Every import from it must live inside the
# guard; otherwise this module raises ImportError when the extra is missing
# and SpacyLLMNlpEngine.is_available can never evaluate to False.
try:
    import spacy_llm
    from spacy_llm.util import assemble_from_config
except ImportError:
    spacy_llm = None
    assemble_from_config = None

logger = logging.getLogger("presidio-analyzer")


class SpacyLLMNlpEngine(SpacyNlpEngine):
    """NLP engine that assembles a spacy-llm pipeline per configured language.

    The pipeline is built from a spaCy ``.cfg`` file whose language, label
    set, few-shot examples path, model and task are overridden at load time.
    An ``OPENAI_API_KEY`` environment variable is required by ``load``.

    :param models: List of model dicts; ``load`` reads ``lang_code`` from each.
    :param ner_model_configuration: NER configuration; the keys of its
        ``model_to_presidio_entity_mapping`` become the LLM task labels.
    :param path_to_config: Path to the spacy-llm ``.cfg`` file.
    :param path_to_examples: Path to the few-shot examples file.
    """

    engine_name = "spacy-llm"
    # False when the optional spacy-llm package could not be imported.
    is_available = bool(spacy_llm)

    def __init__(
        self,
        models: Optional[List[Dict[str, str]]] = None,
        ner_model_configuration: Optional[NerModelConfiguration] = None,
        path_to_config: Optional[str] = "spacy_llm.cfg",
        path_to_examples: Optional[str] = "spacy_llm_examples.json",
    ):
        super().__init__(models, ner_model_configuration)
        # The parameter is Optional, but Path(None) raises TypeError, so fall
        # back to the default file name when None is passed explicitly.
        if path_to_config is None:
            path_to_config = "spacy_llm.cfg"
        self.path_to_config = Path(path_to_config)
        self.path_to_examples = path_to_examples

    def load(self) -> None:
        """Assemble one spacy-llm pipeline per configured model.

        Populates ``self.nlp`` with a ``{lang_code: pipeline}`` mapping.

        :raises ValueError: If ``OPENAI_API_KEY`` is unset or empty.
        """
        # An empty value is as unusable as a missing one.
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError("Could not find the API key to access the OpenAI API. "
                             "Ensure you have an API key set up via "
                             "https://platform.openai.com/account/api-keys, "
                             "then make it available as "
                             "an environment variable 'OPENAI_API_KEY'.")

        labels = list(
            self.ner_model_configuration.model_to_presidio_entity_mapping.keys()
        )

        # Resolve once: both paths are the same for every model.
        config_path = self._resolve_path(self.path_to_config)
        examples_path = self.path_to_examples
        if examples_path is not None:
            examples_path = str(self._resolve_path(examples_path))

        self.nlp = {}
        for model in self.models:
            self._validate_model_params(model)
            language = model["lang_code"]
            # NOTE(review): model["model_name"] is never forwarded, so the
            # llm_models / model_name defaults of _get_overrides always
            # apply -- confirm how the configured name should be mapped.
            overrides = self._get_overrides(language=language,
                                            labels=labels,
                                            examples_path=examples_path)
            config = load_config(config_path,
                                 overrides=overrides,
                                 interpolate=True)

            self.nlp[language] = assemble_from_config(config)

    @staticmethod
    def _resolve_path(path) -> Path:
        """Return ``path``, falling back to this module's directory.

        Absolute paths and paths that exist relative to the working directory
        are returned unchanged. Otherwise the file is looked up next to this
        module, so the default file names work from any working directory.
        If it is not found there either, the original path is returned and
        the caller reports the failure.
        """
        candidate = Path(path)
        if candidate.is_absolute() or candidate.exists():
            return candidate
        bundled = Path(__file__).parent / candidate
        return bundled if bundled.exists() else candidate

    @staticmethod
    def _get_overrides(
        language: str,
        labels: List[str],
        examples_path: Optional[str] = None,
        llm_models: str = "spacy.GPT-3-5.v2",
        model_name: str = "gpt-3.5-turbo",
        llm_tasks: str = "spacy.NER.v3",
    ) -> Dict[str, Any]:
        """Create a config dict for the NER model which overrides the defaults.

        :param language: Value for ``nlp.lang``.
        :param labels: Entity labels for the LLM NER task.
        :param examples_path: Path to the few-shot examples file.
        :param llm_models: Registry name for ``@llm_models``.
        :param model_name: Value for ``components.llm.model.name``.
        :param llm_tasks: Registry name for ``@llm_tasks``.
        :return: Mapping of dotted config keys to their override values.
        """
        return {
            "nlp.lang": language,
            "components.llm.task.labels": labels,
            "components.llm.task.examples.path": examples_path,
            "components.llm.model.@llm_models": llm_models,
            "components.llm.model.name": model_name,
            "components.llm.task.@llm_tasks": llm_tasks,
        }


if __name__ == "__main__":
    # Manual smoke test: build the engine from the bundled YAML configuration
    # and print the artifacts for one sentence.
    from dotenv import load_dotenv

    load_dotenv()
    # Anchor the conf path on this file rather than the working directory,
    # so the script runs from anywhere.
    conf_file = Path(__file__).resolve().parents[2] / "conf" / "spacy_llm.yaml"
    nlp_engine = NlpEngineProvider(conf_file=conf_file).create_engine()
    nlp_artifacts = nlp_engine.process_text(
        "My name is John Doe and I live in New York.", "en"
    )
    print(nlp_artifacts)
1 change: 1 addition & 0 deletions presidio-analyzer/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"transformers": ["spacy_huggingface_pipelines"],
"stanza": ["stanza", "spacy_stanza"],
"azure-ai-language": ["azure-ai-textanalytics", "azure-core"],
"llm": ["spacy-llm", "openai"]
},
include_package_data=True,
license="MIT",
Expand Down
Loading