-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
whylabs committed Feb 22, 2024
1 parent fab7b3a, commit 99e6a3e
Showing 18 changed files with 95 additions and 83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,9 @@ | ||
from functools import cache | ||
from typing import Any, Dict, List, Mapping, Optional | ||
|
||
import pandas as pd | ||
import spacy | ||
from presidio_analyzer import AnalyzerEngine, RecognizerResult | ||
from presidio_analyzer.nlp_engine import TransformersNlpEngine | ||
from presidio_anonymizer import AnonymizerEngine | ||
from whylogs_container_types import ContainerConfiguration, LangkitOptions | ||
|
||
|
@@ -13,25 +13,27 @@ | |
from langkit.metrics.library import lib | ||
|
||
|
||
@cache | ||
def get_analyzer() -> AnalyzerEngine: | ||
return AnalyzerEngine() | ||
|
||
|
||
@cache | ||
def get_anonymizer() -> AnonymizerEngine: | ||
return AnonymizerEngine() | ||
|
||
|
||
def custom_presidio_metric(input_name: str) -> MetricCreator: | ||
# Define which transformers model to use | ||
model_config = [ | ||
{ | ||
"lang_code": "en", | ||
"model_name": { | ||
"spacy": "en_core_web_sm", # use a small spaCy model for lemmas, tokens etc. | ||
"transformers": "dslim/bert-base-NER", | ||
}, | ||
} | ||
] | ||
nlp_engine = TransformersNlpEngine(models=model_config) | ||
analyzer = AnalyzerEngine(nlp_engine=nlp_engine) | ||
anonymizer = AnonymizerEngine() | ||
def cache_assets(): | ||
spacy.load("en_core_web_lg") | ||
|
||
def init(): | ||
spacy.load("en_core_web_sm") | ||
get_analyzer() | ||
get_anonymizer() | ||
|
||
def udf(text: pd.DataFrame) -> MultiMetricResult: | ||
analyzer = get_analyzer() | ||
anonymizer = get_anonymizer() | ||
entity_types = { | ||
"PHONE_NUMBER": f"{input_name}.pii.phone_number", | ||
"EMAIL_ADDRESS": f"{input_name}.pii.email_address", | ||
|
@@ -83,16 +85,10 @@ def udf(text: pd.DataFrame) -> MultiMetricResult: | |
f"{input_name}.pii.credit_card", | ||
f"{input_name}.pii.anonymized", | ||
] | ||
return lambda: MultiMetric(names=metric_names, input_name=input_name, evaluate=udf, init=init) | ||
return lambda: MultiMetric(names=metric_names, input_name=input_name, evaluate=udf, init=init, cache_assets=cache_assets) | ||
|
||
|
||
class MyCallback(Callback): | ||
def post_evaluation(self, metric_results: Mapping[str, MetricResult]) -> None: | ||
""" | ||
This method is called right after all of the metrics run. | ||
""" | ||
pass | ||
|
||
def post_validation( | ||
self, | ||
df: pd.DataFrame, | ||
|
@@ -142,14 +138,14 @@ def get_config() -> ContainerConfiguration: | |
wf = EvaluationWorkflow(metrics=[custom_presidio_metric("prompt")]) | ||
|
||
# If you want, you can run this file directly to see the output of the workflow | ||
# df = pd.DataFrame( | ||
# { | ||
# "prompt": [ | ||
# "Hey! Here is my phone number: 555-555-5555, and my email is [email protected]. And my friend's email is [email protected]", | ||
# "no pii here", | ||
# ], | ||
# "response": ["YOINK", "good job"], | ||
# } | ||
# ) | ||
# result = wf.evaluate(df) | ||
# print(result.features.transpose()) | ||
df = pd.DataFrame( | ||
{ | ||
"prompt": [ | ||
"Hey! Here is my phone number: 555-555-5555, and my email is [email protected]. And my friend's email is [email protected]", | ||
"no pii here", | ||
], | ||
"response": ["YOINK", "good job"], | ||
} | ||
) | ||
result = wf.run(df) | ||
print(result.metrics.transpose()) # type: ignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.