Merge pull request #284 from whylabs/model-perf
Model perf
naddeoa authored Apr 7, 2024
2 parents 78fcbdb + 318aaec commit 465a609
Showing 10 changed files with 391 additions and 27 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.28.dev0
current_version = 0.0.28.dev1
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<build>\d+))?
serialize =
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -83,7 +83,7 @@ jobs:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Run test
- name: Run cache constraint test
env:
WHYLABS_API_KEY: ${{ secrets.WHYLABS_API_KEY }}
run: make test-cache-constraint
7 changes: 6 additions & 1 deletion Dockerfile.cache_test
@@ -4,7 +4,6 @@ WORKDIR /opt/whylogs-container
RUN chown -R whylabs:whylabs /opt/whylogs-container
USER whylabs

ENV LLM_CONTAINER=True
ENV CONTAINER_CACHE_BASE=/opt/whylogs-container/.cache
ENV HF_HOME=$CONTAINER_CACHE_BASE/hf_home/
ENV NLTK_DATA=$CONTAINER_CACHE_BASE/nltk_data/
@@ -38,9 +37,15 @@ RUN poetry config virtualenvs.in-project true
RUN poetry install --no-root --extras "all" --without dev
RUN rm -rf .venv/lib/python3.10/site-packages/pandas/tests # Pandas deploys a ton of tests to pypi

ENV TRANSFORMERS_VERBOSITY=debug

COPY ./langkit ./langkit
RUN bash -c "source .venv/bin/activate; python -m langkit.scripts.langkit_cache"
RUN find $CONTAINER_CACHE_BASE/

# This step will fail if any network requests happen
ENV TRANSFORMERS_OFFLINE=1
ENV HF_DATASETS_OFFLINE=1
ENV HF_HUB_OFFLINE=1
RUN --network=none bash -c "source .venv/bin/activate; python -m langkit.scripts.langkit_cache --skip-downloads"
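As a side note, the offline constraint exercised by the --network=none step above can also be checked in a plain Python session. A minimal sketch, assuming the same cache layout: the env vars mirror the ENV lines above, and the model name is the one pinned in langkit/metrics/topic_onnx.py later in this diff.

import os

# Disable all Hugging Face network access before importing transformers,
# mirroring the ENV lines used for the --network=none build step.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

from transformers import AutoTokenizer

# Succeeds only if the tokenizer files are already in the local cache;
# otherwise it raises instead of reaching out to the Hub.
AutoTokenizer.from_pretrained(
    "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33",
    local_files_only=True,
)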

2 changes: 1 addition & 1 deletion langkit/asset_downloader.py
@@ -36,7 +36,7 @@ def _get_asset_path(asset_id: str, tag: str = "0") -> AssetPath:
asset_id=asset_id,
tag=tag,
zip_path=f"{LANGKIT_CACHE}/assets/{asset_id}/{tag}/{asset_id}.zip",
extract_path=f"{LANGKIT_CACHE}/assets/{asset_id}/{tag}/{asset_id}/",
extract_path=f"{LANGKIT_CACHE}/assets/{asset_id}/{tag}/{asset_id}",
)


55 changes: 38 additions & 17 deletions langkit/metrics/library.py
@@ -16,14 +16,13 @@ def all(prompt: bool = True, response: bool = True) -> MetricCreator:
from langkit.metrics.text_statistics import prompt_textstat_metric, response_textstat_metric
from langkit.metrics.themes.themes import prompt_jailbreak_similarity_metric, response_refusal_similarity_metric
from langkit.metrics.token import prompt_token_metric, response_token_metric
from langkit.metrics.toxicity import prompt_toxicity_metric, response_toxicity_metric

prompt_metrics = [
prompt_textstat_metric,
prompt_token_metric,
prompt_regex_metric,
prompt_sentiment_polarity,
prompt_toxicity_metric,
lib.prompt.toxicity(),
prompt_response_input_output_similarity_metric,
prompt_injections_metric,
prompt_jailbreak_similarity_metric,
@@ -38,7 +37,7 @@ def all(prompt: bool = True, response: bool = True) -> MetricCreator:
response_sentiment_polarity,
response_refusal_similarity_metric,
response_presidio_pii_metric,
response_toxicity_metric,
lib.response.toxicity(),
lib.response.topics.medicine(),
]

@@ -119,7 +118,7 @@ def toxicity_score() -> MetricCreator:
Analyze the input for toxicity. The output of this metric ranges from 0 to 1, where 0 indicates
non-toxic and 1 indicates toxic.
"""
from langkit.metrics.toxicity import prompt_toxicity_metric
from langkit.metrics.toxicity_onnx import prompt_toxicity_metric

return prompt_toxicity_metric

@@ -282,20 +281,31 @@ def sentiment_score() -> MetricCreator:
return prompt_sentiment_polarity

class topics:
def __init__(self, topics: List[str], hypothesis_template: Optional[str] = None):
def __init__(self, topics: List[str], hypothesis_template: Optional[str] = None, onnx: bool = True):
self.topics = topics
self.hypothesis_template = hypothesis_template
self.onnx = onnx

def __call__(self) -> MetricCreator:
from langkit.metrics.topic import topic_metric
if self.onnx:
from langkit.metrics.topic_onnx import topic_metric

return partial(topic_metric, "prompt", self.topics, self.hypothesis_template)
return partial(topic_metric, "prompt", self.topics, self.hypothesis_template)
else:
from langkit.metrics.topic import topic_metric

return partial(topic_metric, "prompt", self.topics, self.hypothesis_template)

@staticmethod
def medicine() -> MetricCreator:
from langkit.metrics.topic import topic_metric
def medicine(onnx: bool = False) -> MetricCreator:
if onnx:
from langkit.metrics.topic_onnx import topic_metric

return lambda: topic_metric("prompt", ["medicine"])
else:
from langkit.metrics.topic import topic_metric

return lambda: topic_metric("prompt", ["medicine"])
return lambda: topic_metric("prompt", ["medicine"])

class response:
@staticmethod
@@ -326,7 +336,7 @@ def toxicity_score() -> MetricCreator:
Analyze the toxicity of the response. The output of this metric ranges from 0 to 1, where 0
indicates a non-toxic response and 1 indicates a toxic response.
"""
from langkit.metrics.toxicity import response_toxicity_metric
from langkit.metrics.toxicity_onnx import response_toxicity_metric

return response_toxicity_metric

@@ -486,17 +496,28 @@ def refusal() -> MetricCreator:
return response_refusal_similarity_metric

class topics:
def __init__(self, topics: List[str], hypothesis_template: Optional[str] = None):
def __init__(self, topics: List[str], hypothesis_template: Optional[str] = None, onnx: bool = True):
self.topics = topics
self.hypothesis_template = hypothesis_template
self.onnx = onnx

def __call__(self) -> MetricCreator:
from langkit.metrics.topic import topic_metric
if self.onnx:
from langkit.metrics.topic_onnx import topic_metric

return partial(topic_metric, "response", self.topics, self.hypothesis_template)
return partial(topic_metric, "response", self.topics, self.hypothesis_template)
else:
from langkit.metrics.topic import topic_metric

return partial(topic_metric, "response", self.topics, self.hypothesis_template)

@staticmethod
def medicine() -> MetricCreator:
from langkit.metrics.topic import topic_metric
def medicine(onnx: bool = False) -> MetricCreator:
if onnx:
from langkit.metrics.topic_onnx import topic_metric

return partial(topic_metric, "response", ["medicine"])
else:
from langkit.metrics.topic import topic_metric

return partial(topic_metric, "response", ["medicine"])
return partial(topic_metric, "response", ["medicine"])
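A minimal usage sketch for the new onnx switch. The import path for lib is assumed from the calls inside all(); the wiring that consumes the resulting MetricCreator is not shown in this diff.

from langkit.metrics.library import lib

# Default (onnx=True): ONNX-backed zero-shot topic metric on the prompt column.
prompt_topics = lib.prompt.topics(["medicine", "finance"])()

# Opt out and fall back to the original transformers pipeline for the response column.
response_topics = lib.response.topics(["medicine", "finance"], onnx=False)()

Each call returns a MetricCreator, and the backing topic module is imported lazily, so the non-ONNX path never loads the ONNX runtime.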
152 changes: 152 additions & 0 deletions langkit/metrics/topic_onnx.py
@@ -0,0 +1,152 @@
# pyright: reportUnknownMemberType=none
# pyright: reportUnknownVariableType=none
# pyright: reportUnknownLambdaType=none

import os
from dataclasses import dataclass
from functools import lru_cache, partial
from typing import List, Optional, TypedDict

import pandas as pd
import torch
from optimum.modeling_base import PreTrainedModel
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, Pipeline, PreTrainedTokenizerBase, pipeline # type: ignore

from langkit.core.metric import MetricCreator, MultiMetric, MultiMetricResult, UdfInput

__default_topics = [
"medicine",
"economy",
"technology",
"entertainment",
]

_hypothesis_template = "This example is about {}"

_model = "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33"
_revision = "dea69e79cd6063916d08b883ea8a3c1823fd10b4"


def _download_assets():
ORTModelForSequenceClassification.from_pretrained(
_model,
subfolder="onnx",
file_name="model.onnx",
revision=_revision,
export=False,
)
AutoTokenizer.from_pretrained(_model, revision=_revision)


def _get_tokenizer() -> PreTrainedTokenizerBase:
return AutoTokenizer.from_pretrained(_model, revision=_revision, local_files_only=True)


def _get_model() -> PreTrainedModel:
# return ORTModelForSequenceClassification.from_pretrained(
# _model,
# subfolder="onnx",
# file_name="model.onnx",
# export=False,
# revision=_revision,
# local_files_only=True,
# )
# Optimum doesn't support offline mode: https://github.com/huggingface/optimum/issues/1796
# The workaround for now is to reference the cached model path directly. Uncomment the code above once the issue is resolved.

model_name = _model.replace("/", "--")
home_dir = os.path.expanduser("~")
base = os.environ.get("HF_HOME", os.path.join(home_dir, ".cache/huggingface"))
model_path = f"{base}/hub/models--{model_name}/snapshots/{_revision}"
return ORTModelForSequenceClassification.from_pretrained(
model_path,
file_name="onnx/model.onnx",
export=False,
revision=_revision,
local_files_only=True,
)


@lru_cache
def _get_classifier() -> Pipeline:
return pipeline(
"zero-shot-classification",
model=_get_model(), # pyright: ignore[reportArgumentType]
tokenizer=_get_tokenizer(), # pyright: ignore[reportArgumentType]
truncation=True,
device="cuda" if torch.cuda.is_available() else "cpu",
)


class ClassificationResults(TypedDict):
sequence: str
labels: List[str]
scores: List[float]


def __get_scores_per_label(
text: List[str], topics: List[str], hypothesis_template: str = _hypothesis_template, multi_label: bool = True
) -> List[ClassificationResults]:
if not text:
return []

classifier = _get_classifier()
result: List[ClassificationResults] = classifier(text, topics, hypothesis_template=hypothesis_template, multi_label=multi_label) # type: ignore
return result


def _sanitize_metric_name(topic: str) -> str:
"""
Sanitize a metric name created from a topic: replace spaces with underscores and lowercase the result.
"""
return topic.replace(" ", "_").lower()


def topic_metric(input_name: str, topics: List[str], hypothesis_template: Optional[str] = None) -> MultiMetric:
hypothesis_template = hypothesis_template or _hypothesis_template

def udf(text: pd.DataFrame) -> MultiMetricResult:
value: List[str] = list(UdfInput(text).iter_column_rows(input_name))
results = __get_scores_per_label(value, topics=topics, hypothesis_template=hypothesis_template)

all_metrics: List[List[float]] = [[] for _ in topics]
for result in results:
# Map each topic to its score in the current result
topic_to_score = {label: score for label, score in zip(result["labels"], result["scores"])}
# For each topic, append the score to the corresponding list in all_metrics
for i, topic in enumerate(topics):
all_metrics[i].append(topic_to_score[topic])  # Append this row's score for topic i

return MultiMetricResult(metrics=all_metrics)

def cache_assets():
_download_assets()

def init():
_get_classifier()

metric_names = [f"{input_name}.topics.{_sanitize_metric_name(topic)}" for topic in topics]
return MultiMetric(names=metric_names, input_names=[input_name], evaluate=udf, cache_assets=cache_assets, init=init)


prompt_topic_module = partial(topic_metric, "prompt", __default_topics, _hypothesis_template)
response_topic_module = partial(topic_metric, "response", __default_topics, _hypothesis_template)
prompt_response_topic_module = [prompt_topic_module, response_topic_module]


@dataclass
class CustomTopicModules:
prompt_topic_module: MetricCreator
response_topic_module: MetricCreator
prompt_response_topic_module: MetricCreator


def get_custom_topic_modules(topics: List[str], template: str = _hypothesis_template) -> CustomTopicModules:
prompt_topic_module = partial(topic_metric, "prompt", topics, template)
response_topic_module = partial(topic_metric, "response", topics, template)
return CustomTopicModules(
prompt_topic_module=prompt_topic_module,
response_topic_module=response_topic_module,
prompt_response_topic_module=[prompt_topic_module, response_topic_module],
)
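A short sketch of how the custom topic helpers above might be used; how the resulting creators are registered downstream is outside this diff and assumed.

from langkit.metrics.topic_onnx import get_custom_topic_modules

modules = get_custom_topic_modules(["finance", "legal"])

# Each field is a MetricCreator; the combined module covers both columns.
prompt_only = modules.prompt_topic_module
both_columns = modules.prompt_response_topic_module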
81 changes: 81 additions & 0 deletions langkit/metrics/toxicity_onnx.py
@@ -0,0 +1,81 @@
# pyright: reportUnknownMemberType=none
# pyright: reportUnknownVariableType=none
# pyright: reportUnknownLambdaType=none
import os
from functools import lru_cache, partial
from typing import List, cast

import numpy as np
import onnxruntime
import pandas as pd
from transformers import (
AutoTokenizer,
PreTrainedTokenizerBase,
)

from langkit.asset_downloader import get_asset
from langkit.core.metric import Metric, SingleMetric, SingleMetricResult, UdfInput
from langkit.onnx_encoder import TransformerModel


def __toxicity(tokenizer: PreTrainedTokenizerBase, session: onnxruntime.InferenceSession, max_length: int, text: List[str]) -> List[float]:
max_length_in_chars = tokenizer.model_max_length * 5
truncated_text = [content[:max_length_in_chars] for content in text]
inputs = tokenizer(truncated_text, return_tensors="pt", padding=True, truncation=True)
onnx_inputs = {k: v.numpy() for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}
onnx_output_logits = session.run(None, onnx_inputs)[0]

# Apply softmax to convert logits into probabilities
probabilities = np.exp(onnx_output_logits) / np.sum(np.exp(onnx_output_logits), axis=1, keepdims=True) # pyright: ignore[reportUnknownArgumentType]
labels = ["non-toxic", "toxic"]
# Find the index of the highest probability to determine the predicted label
predicted_label_idx = np.argmax(probabilities, axis=1)
predicted_labels: List[str] = [labels[idx] for idx in predicted_label_idx]
predicted_scores: List[float] = [prob[idx] for prob, idx in zip(probabilities, predicted_label_idx)]
results = [{"label": label, "score": score} for label, score in zip(predicted_labels, predicted_scores)]
return [result["score"] if result["label"] == "toxic" else 1.0 - result["score"] for result in results] # type: ignore


def _download_assets():
name, tag = TransformerModel.ToxicCommentModel.value
return get_asset(name, tag)


@lru_cache
def _get_tokenizer() -> PreTrainedTokenizerBase:
return AutoTokenizer.from_pretrained(_download_assets())


@lru_cache
def _get_session() -> onnxruntime.InferenceSession:
downloaded_path = _download_assets()
onnx_model_path = os.path.join(downloaded_path, "model.onnx")
print(f"Loading ONNX model from {onnx_model_path}")
return onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])


def toxicity_metric(column_name: str) -> Metric:
def cache_assets():
_download_assets()

def init():
_get_session()
_get_tokenizer()

def udf(text: pd.DataFrame) -> SingleMetricResult:
_tokenizer = _get_tokenizer()
_session = _get_session()

col = list(UdfInput(text).iter_column_rows(column_name))
max_length = cast(int, _tokenizer.model_max_length)
metrics = __toxicity(_tokenizer, _session, max_length, col)
return SingleMetricResult(metrics=metrics)

return SingleMetric(
name=f"{column_name}.toxicity.toxicity_score", input_names=[column_name], evaluate=udf, init=init, cache_assets=cache_assets
)


prompt_toxicity_metric = partial(toxicity_metric, "prompt")
response_toxicity_metric = partial(toxicity_metric, "response")
prompt_response_toxicity_module = [prompt_toxicity_metric, response_toxicity_metric]
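And a minimal sketch of building the ONNX toxicity metric for a single column; the harness that evaluates a SingleMetric is assumed, not shown here.

from langkit.metrics.toxicity_onnx import prompt_toxicity_metric

# Builds the "prompt.toxicity.toxicity_score" metric definition; the ONNX asset is
# downloaded in cache_assets() and the session/tokenizer are loaded lazily in init().
toxicity = prompt_toxicity_metric()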
(Diffs for the remaining 3 changed files are not shown in this view.)
