Update from https://github.com/whylabs-ai/whylogs-container-python/commit/b17052e9034ff8ab23fc3e29107242132e379ff2
whylabs · Feb 22, 2024 · 99e6a3e · 99e6a3e
1 parent fab7b3a
commit 99e6a3e
Show file tree

Hide file tree

Showing 18 changed files with 95 additions and 83 deletions.
diff --git a/examples/configure_container_python/Dockerfile b/examples/configure_container_python/Dockerfile
@@ -1,4 +1,4 @@
-FROM whylabs/whylogs:py-llm-1.0.3
+FROM whylabs/whylogs:py-llm-1.0.4
 
 # Force the container to fail if the config is not present. Safeguard for messing up the
 # build in such a way that the config is not included correctly.

diff --git a/examples/configure_container_python/poetry.lock b/examples/configure_container_python/poetry.lock
diff --git a/examples/configure_container_python/pyproject.toml b/examples/configure_container_python/pyproject.toml
@@ -13,7 +13,7 @@ python = "^3.10"
 # These are all dev dependencies. They're already included in the container and we don't want to
 # overwrite those versions, we just want types and auto completion in this project.
 langkit = {url = "https://whypy.s3.us-west-2.amazonaws.com/langkit-0.0.43-py3-none-any.whl"}
-whylogs-container-client = "1.0.3"
+whylogs-container-client = "1.0.4"
 whylogs-container-types = {url = "https://whypy.s3.us-west-2.amazonaws.com/whylogs_container_types-0.4.10-py3-none-any.whl"}
 pandas = "1.3.5"
 

diff --git a/examples/configure_container_yaml/Dockerfile b/examples/configure_container_yaml/Dockerfile
@@ -1,4 +1,4 @@
-FROM whylabs/whylogs:py-llm-1.0.3
+FROM whylabs/whylogs:py-llm-1.0.4
 
 # Force the container to fail if the config is not present. Safeguard for messing up the
 # build in such a way that the config is not included correctly.

diff --git a/examples/configure_container_yaml/poetry.lock b/examples/configure_container_yaml/poetry.lock
diff --git a/examples/configure_container_yaml/pyproject.toml b/examples/configure_container_yaml/pyproject.toml
@@ -12,7 +12,7 @@ python = "^3.10"
 [tool.poetry.group.dev.dependencies]
 # These are all dev dependencies. They're already included in the container and we don't want to
 # overwrite those versions, we just want types and auto completion in this project.
-whylogs-container-client = "1.0.3"
+whylogs-container-client = "1.0.4"
 pandas = "1.3.5"
 
 pyright = "^1.1.347"

diff --git a/examples/custom_model/Dockerfile b/examples/custom_model/Dockerfile
@@ -1,4 +1,4 @@
-FROM whylabs/whylogs:py-llm-1.0.3
+FROM whylabs/whylogs:py-llm-1.0.4
 
 # Force the container to fail if the config is not present. Safeguard for messing up the
 # build in such a way that the config is not included correctly.
@@ -10,6 +10,12 @@ RUN /bin/bash -c "source .venv/bin/activate; pip install -r ./requirements.txt"
 # Copy our custom config code
 COPY ./whylogs_config /opt/whylogs-container/whylogs_container/whylogs_config/
 
+# The base docker image disables hugging face downloads by default so we need to temporarily
+# enable them here
+ENV TRANSFORMERS_OFFLINE=0
+ENV HF_DATASETS_OFFLINE=0
 # Run our __main__ in our config.py file to trigger model downloads now
 # so we don't have to do it at container launch time.
 RUN /bin/bash -c "source .venv/bin/activate; python ./whylogs_container/whylogs_config/config.py"
+ENV TRANSFORMERS_OFFLINE=1
+ENV HF_DATASETS_OFFLINE=1
diff --git a/examples/custom_model/poetry.lock b/examples/custom_model/poetry.lock
diff --git a/examples/custom_model/pyproject.toml b/examples/custom_model/pyproject.toml
@@ -18,9 +18,9 @@ torch = {version = "2.0.0", optional = true, source = "torch"}
 [tool.poetry.group.dev.dependencies]
 # These are all dev dependencies. They're already included in the container and we don't want to
 # overwrite those versions, we just want types and auto completion in this project.
-langkit = {url = "https://whypy.s3.us-west-2.amazonaws.com/langkit-0.0.47-py3-none-any.whl"}
+langkit = {url = "https://whypy.s3.us-west-2.amazonaws.com/langkit-0.0.69-py3-none-any.whl"}
 whylogs-container-types = {url = "https://whypy.s3.us-west-2.amazonaws.com/whylogs_container_types-0.4.11-py3-none-any.whl"}
-whylogs-container-client = "1.0.3"
+whylogs-container-client = "1.0.4"
 pandas = "1.3.5"
 
 pyright = "^1.1.347"

diff --git a/examples/custom_model/whylogs_config/config.py b/examples/custom_model/whylogs_config/config.py
@@ -1,9 +1,9 @@
+from functools import cache
 from typing import Any, Dict, List, Mapping, Optional
 
 import pandas as pd
 import spacy
 from presidio_analyzer import AnalyzerEngine, RecognizerResult
-from presidio_analyzer.nlp_engine import TransformersNlpEngine
 from presidio_anonymizer import AnonymizerEngine
 from whylogs_container_types import ContainerConfiguration, LangkitOptions
 
@@ -13,25 +13,27 @@
 from langkit.metrics.library import lib
 
 
+@cache
+def get_analyzer() -> AnalyzerEngine:
+    return AnalyzerEngine()
+
+
+@cache
+def get_anonymizer() -> AnonymizerEngine:
+    return AnonymizerEngine()
+
+
 def custom_presidio_metric(input_name: str) -> MetricCreator:
-    # Define which transformers model to use
-    model_config = [
-        {
-            "lang_code": "en",
-            "model_name": {
-                "spacy": "en_core_web_sm",  # use a small spaCy model for lemmas, tokens etc.
-                "transformers": "dslim/bert-base-NER",
-            },
-        }
-    ]
-    nlp_engine = TransformersNlpEngine(models=model_config)
-    analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
-    anonymizer = AnonymizerEngine()
+    def cache_assets():
+        spacy.load("en_core_web_lg")
 
     def init():
-        spacy.load("en_core_web_sm")
+        get_analyzer()
+        get_anonymizer()
 
     def udf(text: pd.DataFrame) -> MultiMetricResult:
+        analyzer = get_analyzer()
+        anonymizer = get_anonymizer()
         entity_types = {
             "PHONE_NUMBER": f"{input_name}.pii.phone_number",
             "EMAIL_ADDRESS": f"{input_name}.pii.email_address",
@@ -83,16 +85,10 @@ def udf(text: pd.DataFrame) -> MultiMetricResult:
         f"{input_name}.pii.credit_card",
         f"{input_name}.pii.anonymized",
     ]
-    return lambda: MultiMetric(names=metric_names, input_name=input_name, evaluate=udf, init=init)
+    return lambda: MultiMetric(names=metric_names, input_name=input_name, evaluate=udf, init=init, cache_assets=cache_assets)
 
 
 class MyCallback(Callback):
-    def post_evaluation(self, metric_results: Mapping[str, MetricResult]) -> None:
-        """
-        This method is called right after all of the metrics run.
-        """
-        pass
-
     def post_validation(
         self,
         df: pd.DataFrame,
@@ -142,14 +138,14 @@ def get_config() -> ContainerConfiguration:
     wf = EvaluationWorkflow(metrics=[custom_presidio_metric("prompt")])
 
     # If you want, you can run this file directly to see the output of the workflow
-    # df = pd.DataFrame(
-    #     {
-    #         "prompt": [
-    #             "Hey! Here is my phone number: 555-555-5555, and my email is [email protected]. And my friend's email is [email protected]",
-    #             "no pii here",
-    #         ],
-    #         "response": ["YOINK", "good job"],
-    #     }
-    # )
-    # result = wf.evaluate(df)
-    # print(result.features.transpose())
+    df = pd.DataFrame(
+        {
+            "prompt": [
+                "Hey! Here is my phone number: 555-555-5555, and my email is [email protected]. And my friend's email is [email protected]",
+                "no pii here",
+            ],
+            "response": ["YOINK", "good job"],
+        }
+    )
+    result = wf.run(df)
+    print(result.metrics.transpose())  # type: ignore
diff --git a/examples/no_configuration/Makefile b/examples/no_configuration/Makefile
@@ -1,7 +1,7 @@
 .PHONY: help requirements build run all clean lint lint-fix format format-fix fix test pip-install-python-client pull install
 
 CONTAINER_NAME = langkit_example_configure_container_python
-DOCKER_IMAGE = whylabs/whylogs:py-llm-1.0.3
+DOCKER_IMAGE = whylabs/whylogs:py-llm-1.0.4
 
 
 all: build

diff --git a/examples/no_configuration/poetry.lock b/examples/no_configuration/poetry.lock
diff --git a/examples/no_configuration/pyproject.toml b/examples/no_configuration/pyproject.toml
@@ -15,7 +15,7 @@ python = "^3.10"
 whylogs = "*"
 langkit = {url = "https://whypy.s3.us-west-2.amazonaws.com/langkit-0.0.47-py3-none-any.whl"}
 whylogs-container-types = {url = "https://whypy.s3.us-west-2.amazonaws.com/whylogs_container_types-0.4.11-py3-none-any.whl"}
-whylogs-container-client = "1.0.3"
+whylogs-container-client = "1.0.4"
 pandas = "1.3.5"
 
 pyright = "^1.1.347"

diff --git a/examples/s3_configuration/Makefile b/examples/s3_configuration/Makefile
@@ -1,7 +1,7 @@
 .PHONY: help requirements build run all clean lint lint-fix format format-fix fix test pip-install-python-client pull install
 
 CONTAINER_NAME = langkit_example_s3_sync_configure
-DOCKER_IMAGE = whylabs/whylogs:py-llm-1.0.3
+DOCKER_IMAGE = whylabs/whylogs:py-llm-1.0.4
 
 
 all: build

diff --git a/examples/s3_configuration/README.md b/examples/s3_configuration/README.md
@@ -29,19 +29,21 @@ DEFAULT_WHYLABS_DATASET_CADENCE=DAILY
 # this example shows how it works without custom config at startup time.
 FAIL_STARTUP_WITHOUT_CONFIG=False
 
-
 # These are the s3 related variables
 S3_CONFIG_SYNC=True
 S3_CONFIG_BUCKET_NAME=<bucket-name>
 S3_CONFIG_PREFIX=bucket/prefix/path/
 S3_CONFIG_SYNC_CADENCE=M
-S3_CONFIG_SYNC_INTERVAL=10  # How often to check for new policies. 15 minutes by default if you omit this
+# How often to check for new policies. 15 minutes by default if you omit this
+S3_CONFIG_SYNC_INTERVAL=10
 
 # We use the AWS Python SDK (boto3) to access s3. It checks the environment for certain
 # AWS variables to determine auth. These have to be available in the container.
 AWS_ACCESS_KEY_ID=....
 AWS_SECRET_ACCESS_KEY=....
 AWS_SESSION_TOKEN=...
+# Or use roles
+S3_CONFIG_SYNC_ROLE_ARN=
 ```
 
 Now you can run standard langkit container and send requests to it.

diff --git a/examples/s3_configuration/poetry.lock b/examples/s3_configuration/poetry.lock