
Commit

Add new examples with integ tests
naddeoa committed Jan 26, 2024
1 parent a2ced04 commit 4b00a10
Showing 38 changed files with 6,521 additions and 864 deletions.
5 changes: 0 additions & 5 deletions examples/configure-container/Dockerfile

This file was deleted.

3 binary files not shown.
44 changes: 0 additions & 44 deletions examples/configure-container/test/test_container.py

This file was deleted.

32 changes: 0 additions & 32 deletions examples/configure-container/whylogs_config/config.yaml

This file was deleted.

7 changes: 5 additions & 2 deletions examples/configure_container_python/Dockerfile
@@ -1,5 +1,8 @@
 # FROM whylabs/whylogs:py-llm-1.0.1
-FROM whylabs/whylogs:py-llm-latest
+FROM whylabs/whylogs:py-llm-1.0.2.dev0

+# Force the container to fail if the config is not present. Safeguard for messing up the
+# build in such a way that the config is not included correctly.
+ENV FAIL_STARTUP_WITHOUT_CONFIG=True

 # Copy our custom config code
 COPY ./whylogs_config /opt/whylogs-container/whylogs_container/whylogs_config/
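With FAIL_STARTUP_WITHOUT_CONFIG=True baked into the image, a build that forgets to copy the config now dies at startup instead of silently serving the default policy. A minimal smoke-test sketch of that guard, assuming the container is mapped to localhost:8000 and uses "password" as its auth token (both assumptions, not values from this commit):

# Hypothetical smoke test for the fail-fast guard: the container should only
# report healthy if the custom whylogs_config made it into the image.
import whylogs_container_client.api.manage.health as Health
from whylogs_container_client import AuthenticatedClient

client = AuthenticatedClient(base_url="http://localhost:8000", token="password")  # assumed values

response = Health.sync_detailed(client=client)
assert response.status_code == 200  # a mis-built image exits before serving this endpoint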
9 changes: 7 additions & 2 deletions examples/configure_container_python/Makefile
@@ -1,6 +1,6 @@
-.PHONY: help requirements build run all clean lint lint-fix format format-fix fix test
+.PHONY: help requirements build run all clean lint lint-fix format format-fix fix test pip-install-python-client

-CONTAINER_NAME = langkit_configuration_example
+CONTAINER_NAME = langkit_example_configure_container_python

 all: build

@@ -38,9 +38,14 @@ format-fix: ## Fix formatting issues

 fix: lint-fix format-fix ## Fix all linting and formatting issues

+pip-install-python-client: ## Install the latest python client from the main project
+	poetry run pip uninstall whylogs-container-client -y || true
+	poetry run pip install ../../whylogs-container-client/dist/*.whl

 help: ## Show this help message.
 	@echo 'usage: make [target] ...'
 	@echo
 	@echo 'targets:'
 	@egrep '^(.+)\:(.*) ##\ (.+)' ${MAKEFILE_LIST} | sed -s 's/:\(.*\)##/: ##/' | column -t -c 2 -s ':#'
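The new pip-install-python-client target is the dev loop for testing an unreleased client: build a wheel in the sibling whylogs-container-client project first (so that ../../whylogs-container-client/dist/ contains a .whl), then run make pip-install-python-client here to force this example's environment onto that local build.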


1,016 changes: 455 additions & 561 deletions examples/configure_container_python/poetry.lock

Large diffs are not rendered by default.

16 changes: 6 additions & 10 deletions examples/configure_container_python/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "python_container_example"
+name = "configure_container_python"
 version = "0.1.0"
 description = ""
 authors = ["Anthony Naddeo <[email protected]>"]
@@ -8,24 +8,20 @@ packages = [{include = "whylogs_config"}]

 [tool.poetry.dependencies]
 python = "^3.10"
-whylogs-container-types = {url = "https://guest-session-testing-public.s3.us-west-2.amazonaws.com/whylogs_container_types-0.4.2-py3-none-any.whl"}
+whylogs-container-client = "1.0.2.dev0"

 [tool.poetry.group.dev.dependencies]
 # These are all dev dependencies. They're already included in the container and we don't want to
 # overwrite those versions, we just want types and auto completion in this project.
 # whylogs-container-types = "^0.4.0"
 langkit = {url = "https://whypy.s3.us-west-2.amazonaws.com/langkit-0.0.38-py3-none-any.whl", extras = ["all"]}
+whylogs-container-types = {url = "https://whypy.s3.us-west-2.amazonaws.com/whylogs_container_types-0.4.8-py3-none-any.whl"}
 pandas = "1.3.5"
 whylogs = {version = "1.3.20", extras = ["proc"]}
-whylogs-container-client = "^1.0.1"

 pyright = "^1.1.347"
 ruff = "^0.1.13"
 pytest = "^7.4.4"

-[[tool.poetry.source]]
-name = "torch"
-url = "https://download.pytorch.org/whl/cpu"
-priority = "explicit"
+whylogs-container-client = "*"

 [build-system]
 requires = ["poetry-core"]
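Per the comment in the dev group, those pins exist only for editor type checking and autocompletion, and must mirror what the container already ships. A quick hypothetical check (not part of this repo) to compare the locally installed versions against the pins:

# Print installed versions of the mirrored dev dependencies so they can be
# compared against the container's bundled versions (illustrative only).
from importlib.metadata import PackageNotFoundError, version

for pkg in ("langkit", "pandas", "whylogs", "whylogs-container-client"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg} is not installed")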
6 changes: 1 addition & 5 deletions examples/configure_container_python/test/conftest.py
@@ -8,7 +8,7 @@
 import whylogs_container_client.api.manage.health as Health
 from whylogs_container_client import AuthenticatedClient

-image_name = "langkit_configuration_example"  # from the makefile, run `make build` to build the image
+image_name = "langkit_example_configure_container_python"  # from the makefile, run `make build` to build the image

 T = TypeVar("T")

@@ -52,10 +52,6 @@ def docker(port: str) -> List[str]:
"DEFAULT_WHYLABS_UPLOAD_CADENCE=M",
"--env",
"DEFAULT_WHYLABS_UPLOAD_INTERVAL=5",
"--env",
"FAIL_STARTUP_WITHOUT_CONFIG=False",
"--env",
f"PORT={port}",
image_name,
]

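The FAIL_STARTUP_WITHOUT_CONFIG=False override is dropped here because the example's Dockerfile now sets it to True, so the tests exercise the same fail-fast behavior the image ships with; the PORT override appears to be dropped in favor of the image default.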
185 changes: 158 additions & 27 deletions examples/configure_container_python/test/test_container.py
@@ -1,44 +1,175 @@
 import whylogs_container_client.api.llm.validate_llm as ValidateLLM
 from whylogs_container_client import AuthenticatedClient
 from whylogs_container_client.models.llm_validate_request import LLMValidateRequest
-from whylogs_container_client.models.validation_metadata import ValidationMetadata
-from whylogs_container_client.models.validation_report import ValidationReport
+from whylogs_container_client.models.validation_failure import ValidationFailure
+from whylogs_container_client.models.validation_result import ValidationResult


-def test_toxic_prompt(client: AuthenticatedClient):
+# Validate a prompt and response pair for LLM validations
+def test_toxic_response_131(client: AuthenticatedClient):
     request = LLMValidateRequest(
-        prompt="This llm sucks and everyone who made is sucks.",
+        prompt="How do I see my settings",
+        response="You're a stupid user, it's obvious.",
+        dataset_id="model-131",
     )

+    response = ValidateLLM.sync_detailed(client=client, body=request)
+
+    if not isinstance(response.parsed, ValidationResult):
+        raise Exception(f"Failed to validate data. Status code: {response.status_code}. {response.parsed}")
+
+    actual: ValidationResult = response.parsed
+
+    expected = ValidationResult(
+        report=[
+            ValidationFailure(
+                id=0,
+                metric="response.toxicity",
+                details="Value 0.9642418622970581 is above threshold 0.4",
+                value=0.9642418622970581,
+                upper_threshold=0.4,
+                lower_threshold=None,
+            )
+        ],
+    )
+
+    assert actual == expected
+
+
+def test_upper_case_letters_prompt_131(client: AuthenticatedClient):
+    request = LLMValidateRequest(
+        prompt="...",  # <1 upper case letters
         response="I'm sorry you feel that way.",
-        dataset_id="model-62",
+        dataset_id="model-131",
     )

-    response = ValidateLLM.sync_detailed(client=client, json_body=request)
+    response = ValidateLLM.sync_detailed(client=client, body=request)

-    if not isinstance(response.parsed, ValidationReport):
+    if not isinstance(response.parsed, ValidationResult):
         raise Exception(f"Failed to validate data. Status code: {response.status_code}. {response.parsed}")

-    report: ValidationReport = response.parsed
-
-    expected = ValidationReport(
-        failures=[
-            ValidationMetadata(
-                prompt_id="---",
-                validator_name="toxicity_validator",
-                failed_metric="toxicity_prompt",
-                value=0.9417606592178345,
-                timestamp=None,
-                is_valid=False,
+    actual: ValidationResult = response.parsed
+
+    expected = ValidationResult(
+        report=[
+            ValidationFailure(
+                id=0,
+                metric="prompt.upper_case_char_count",
+                details="Value 0 is below threshold 1",
+                value=0,
+                upper_threshold=None,
+                lower_threshold=1.0,
             )
-        ]
+        ],
     )

-    assert len(report.failures) == 1
+    assert actual == expected
+
+
+def test_upper_case_letters_prompt_reading_ease_response_131(client: AuthenticatedClient):
+    response = (
+        "Playing games has always been thought to be important to "
+        "the development of well-balanced and creative children; "
+        "however, what part, if any, they should play in the lives "
+        "of adults has never been researched that deeply. I believe "
+        "that playing games is every bit as important for adults "
+        "as for children. Not only is taking time out to play games "
+        "with our children and other adults valuable to building "
+        "interpersonal relationships but is also a wonderful way "
+        "to release built up tension."
+    )
+
+    request = LLMValidateRequest(
+        prompt="...",  # <1 upper case letters
+        response=response,
+        dataset_id="model-131",
+    )
+
+    response = ValidateLLM.sync_detailed(client=client, body=request)
+
+    if not isinstance(response.parsed, ValidationResult):
+        raise Exception(f"Failed to validate data. Status code: {response.status_code}. {response.parsed}")
+
+    actual: ValidationResult = response.parsed
+
+    expected = ValidationResult(
+        report=[
+            ValidationFailure(
+                id=0,
+                metric="prompt.upper_case_char_count",
+                details="Value 0 is below threshold 1",
+                value=0,
+                upper_threshold=None,
+                lower_threshold=1.0,
+            ),
+            ValidationFailure(
+                id=0,
+                metric="response.flesch_reading_ease",
+                details="Value 52.23 is below threshold 70.0",
+                value=52.23,
+                upper_threshold=None,
+                lower_threshold=70.0,
+            ),
+        ],
+    )

-    actual = report.failures[0]
+    assert actual == expected
+
+
+def test_prompt_sentiment_133(client: AuthenticatedClient):
+    request = LLMValidateRequest(
+        prompt="Ugh, this is way too hard...",
+        response="I'm sorry you feel that way.",
+        dataset_id="model-133",
+    )
+
+    response = ValidateLLM.sync_detailed(client=client, body=request)
+
+    if not isinstance(response.parsed, ValidationResult):
+        raise Exception(f"Failed to validate data. Status code: {response.status_code}. {response.parsed}")
+
+    actual: ValidationResult = response.parsed
+
+    expected = ValidationResult(
+        report=[
+            ValidationFailure(
+                id=0,
+                metric="prompt.sentiment_polarity",
+                details="Value -0.4215 is below threshold 0",
+                value=-0.4215,
+                upper_threshold=None,
+                lower_threshold=0.0,
+            )
+        ],
+    )
+
+    assert actual == expected
+
+
+def test_response_lower_case_133(client: AuthenticatedClient):
+    request = LLMValidateRequest(
+        prompt="Hello!",
+        response="I'M SORRY YOU FEEL THAT WAY.",
+        dataset_id="model-133",
+    )
+
+    response = ValidateLLM.sync_detailed(client=client, body=request)
+
+    if not isinstance(response.parsed, ValidationResult):
+        raise Exception(f"Failed to validate data. Status code: {response.status_code}. {response.parsed}")
+
+    actual: ValidationResult = response.parsed
+
+    expected = ValidationResult(
+        report=[
+            ValidationFailure(
+                id=0,
+                metric="response.lower_case_char_count",
+                details="Value 0 is below threshold 10",
+                value=0,
+                upper_threshold=None,
+                lower_threshold=10.0,
+            )
+        ],
+    )

-    assert actual.validator_name == expected.failures[0].validator_name
-    assert actual.failed_metric == expected.failures[0].failed_metric
-    assert actual.value == expected.failures[0].value
-    assert actual.timestamp is None
-    assert actual.is_valid == expected.failures[0].is_valid
+    assert actual == expected
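Outside of pytest, the same round trip is only a few lines. A standalone sketch mirroring the tests above; the base_url and token are assumptions for a locally running container, not values from this commit:

# Minimal standalone use of the validation endpoint exercised by these tests.
import whylogs_container_client.api.llm.validate_llm as ValidateLLM
from whylogs_container_client import AuthenticatedClient
from whylogs_container_client.models.llm_validate_request import LLMValidateRequest
from whylogs_container_client.models.validation_result import ValidationResult

client = AuthenticatedClient(base_url="http://localhost:8000", token="password")  # assumed values

request = LLMValidateRequest(
    prompt="How do I reset my password?",
    response="Click 'Forgot password' on the sign-in page.",
    dataset_id="model-131",
)

result = ValidateLLM.sync_detailed(client=client, body=request)
if isinstance(result.parsed, ValidationResult):
    # An empty report means every configured metric passed validation.
    print(result.parsed.report)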