Merge pull request #284 from whylabs/model-perf
Model perf
naddeoa authored Apr 7, 2024
2 parents 78fcbdb + 318aaec commit 465a609
Showing 10 changed files with 391 additions and 27 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.28.dev0
current_version = 0.0.28.dev1
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<build>\d+))?
serialize =
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -83,7 +83,7 @@ jobs:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Run test
- name: Run cache constraint test
env:
WHYLABS_API_KEY: ${{ secrets.WHYLABS_API_KEY }}
run: make test-cache-constraint
7 changes: 6 additions & 1 deletion Dockerfile.cache_test
@@ -4,7 +4,6 @@ WORKDIR /opt/whylogs-container
RUN chown -R whylabs:whylabs /opt/whylogs-container
USER whylabs

ENV LLM_CONTAINER=True
ENV CONTAINER_CACHE_BASE=/opt/whylogs-container/.cache
ENV HF_HOME=$CONTAINER_CACHE_BASE/hf_home/
ENV NLTK_DATA=$CONTAINER_CACHE_BASE/nltk_data/
@@ -38,9 +37,15 @@ RUN poetry config virtualenvs.in-project true
RUN poetry install --no-root --extras "all" --without dev
RUN rm -rf .venv/lib/python3.10/site-packages/pandas/tests # Pandas deploys a ton of tests to pypi

ENV TRANSFORMERS_VERBOSITY=debug

COPY ./langkit ./langkit
RUN bash -c "source .venv/bin/activate; python -m langkit.scripts.langkit_cache"
RUN find $CONTAINER_CACHE_BASE/

# This step will fail if any network requests happen
ENV TRANSFORMERS_OFFLINE=1
ENV HF_DATASETS_OFFLINE=1
ENV HF_HUB_OFFLINE=1
RUN --network=none bash -c "source .venv/bin/activate; python -m langkit.scripts.langkit_cache --skip-downloads"
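As a side note, the offline constraint exercised by the --network=none step above can also be checked in a plain Python session. A minimal sketch, assuming the same cache layout: the env vars mirror the ENV lines above, and the model name is the one pinned in langkit/metrics/topic_onnx.py later in this diff.

import os

# Disable all Hugging Face network access before importing transformers,
# mirroring the ENV lines used for the --network=none build step.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

from transformers import AutoTokenizer

# Succeeds only if the tokenizer files are already in the local cache;
# otherwise it raises instead of reaching out to the Hub.
AutoTokenizer.from_pretrained(
    "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33",
    local_files_only=True,
)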

2 changes: 1 addition & 1 deletion langkit/asset_downloader.py
@@ -36,7 +36,7 @@ def _get_asset_path(asset_id: str, tag: str = "0") -> AssetPath:
asset_id=asset_id,
tag=tag,
zip_path=f"{LANGKIT_CACHE}/assets/{asset_id}/{tag}/{asset_id}.zip",
extract_path=f"{LANGKIT_CACHE}/assets/{asset_id}/{tag}/{asset_id}/",
extract_path=f"{LANGKIT_CACHE}/assets/{asset_id}/{tag}/{asset_id}",
)


55 changes: 38 additions & 17 deletions langkit/metrics/library.py
@@ -16,14 +16,13 @@ def all(prompt: bool = True, response: bool = True) -> MetricCreator:
from langkit.metrics.text_statistics import prompt_textstat_metric, response_textstat_metric
from langkit.metrics.themes.themes import prompt_jailbreak_similarity_metric, response_refusal_similarity_metric
from langkit.metrics.token import prompt_token_metric, response_token_metric
from langkit.metrics.toxicity import prompt_toxicity_metric, response_toxicity_metric

prompt_metrics = [
prompt_textstat_metric,
prompt_token_metric,
prompt_regex_metric,
prompt_sentiment_polarity,
prompt_toxicity_metric,
lib.prompt.toxicity(),
prompt_response_input_output_similarity_metric,
prompt_injections_metric,
prompt_jailbreak_similarity_metric,
@@ -38,7 +37,7 @@ def all(prompt: bool = True, response: bool = True) -> MetricCreator:
response_sentiment_polarity,
response_refusal_similarity_metric,
response_presidio_pii_metric,
response_toxicity_metric,
lib.response.toxicity(),
lib.response.topics.medicine(),
]

@@ -119,7 +118,7 @@ def toxicity_score() -> MetricCreator:
Analyze the input for toxicity. The output of this metric ranges from 0 to 1, where 0 indicates
non-toxic and 1 indicates toxic.
"""
from langkit.metrics.toxicity import prompt_toxicity_metric
from langkit.metrics.toxicity_onnx import prompt_toxicity_metric

return prompt_toxicity_metric

@@ -282,20 +281,31 @@ def sentiment_score() -> MetricCreator:
return prompt_sentiment_polarity

class topics:
def __init__(self, topics: List[str], hypothesis_template: Optional[str] = None):
def __init__(self, topics: List[str], hypothesis_template: Optional[str] = None, onnx: bool = True):
self.topics = topics
self.hypothesis_template = hypothesis_template
self.onnx = onnx

def __call__(self) -> MetricCreator:
from langkit.metrics.topic import topic_metric
if self.onnx:
from langkit.metrics.topic_onnx import topic_metric

return partial(topic_metric, "prompt", self.topics, self.hypothesis_template)
return partial(topic_metric, "prompt", self.topics, self.hypothesis_template)
else:
from langkit.metrics.topic import topic_metric

return partial(topic_metric, "prompt", self.topics, self.hypothesis_template)

@staticmethod
def medicine() -> MetricCreator:
from langkit.metrics.topic import topic_metric
def medicine(onnx: bool = False) -> MetricCreator:
if onnx:
from langkit.metrics.topic_onnx import topic_metric

return lambda: topic_metric("prompt", ["medicine"])
else:
from langkit.metrics.topic import topic_metric

return lambda: topic_metric("prompt", ["medicine"])
return lambda: topic_metric("prompt", ["medicine"])

class response:
@staticmethod
@@ -326,7 +336,7 @@ def toxicity_score() -> MetricCreator:
Analyze the toxicity of the response. The output of this metric ranges from 0 to 1, where 0
indicates a non-toxic response and 1 indicates a toxic response.
"""
from langkit.metrics.toxicity import response_toxicity_metric
from langkit.metrics.toxicity_onnx import response_toxicity_metric

return response_toxicity_metric

@@ -486,17 +496,28 @@ def refusal() -> MetricCreator:
return response_refusal_similarity_metric

class topics:
def __init__(self, topics: List[str], hypothesis_template: Optional[str] = None):
def __init__(self, topics: List[str], hypothesis_template: Optional[str] = None, onnx: bool = True):
self.topics = topics
self.hypothesis_template = hypothesis_template
self.onnx = onnx

def __call__(self) -> MetricCreator:
from langkit.metrics.topic import topic_metric
if self.onnx:
from langkit.metrics.topic_onnx import topic_metric

return partial(topic_metric, "response", self.topics, self.hypothesis_template)
return partial(topic_metric, "response", self.topics, self.hypothesis_template)
else:
from langkit.metrics.topic import topic_metric

return partial(topic_metric, "response", self.topics, self.hypothesis_template)

@staticmethod
def medicine() -> MetricCreator:
from langkit.metrics.topic import topic_metric
def medicine(onnx: bool = False) -> MetricCreator:
if onnx:
from langkit.metrics.topic_onnx import topic_metric

return partial(topic_metric, "response", ["medicine"])
else:
from langkit.metrics.topic import topic_metric

return partial(topic_metric, "response", ["medicine"])
return partial(topic_metric, "response", ["medicine"])
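A minimal usage sketch for the new onnx switch. The import path for lib is assumed from the calls inside all(); the wiring that consumes the resulting MetricCreator is not shown in this diff.

from langkit.metrics.library import lib

# Default (onnx=True): ONNX-backed zero-shot topic metric on the prompt column.
prompt_topics = lib.prompt.topics(["medicine", "finance"])()

# Opt out and fall back to the original transformers pipeline for the response column.
response_topics = lib.response.topics(["medicine", "finance"], onnx=False)()

Each call returns a MetricCreator, and the backing topic module is imported lazily, so the non-ONNX path never loads the ONNX runtime.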
152 changes: 152 additions & 0 deletions langkit/metrics/topic_onnx.py
@@ -0,0 +1,152 @@
# pyright: reportUnknownMemberType=none
# pyright: reportUnknownVariableType=none
# pyright: reportUnknownLambdaType=none

import os
from dataclasses import dataclass
from functools import lru_cache, partial
from typing import List, Optional, TypedDict

import pandas as pd
import torch
from optimum.modeling_base import PreTrainedModel
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer, Pipeline, PreTrainedTokenizerBase, pipeline # type: ignore

from langkit.core.metric import MetricCreator, MultiMetric, MultiMetricResult, UdfInput

__default_topics = [
"medicine",
"economy",
"technology",
"entertainment",
]

_hypothesis_template = "This example is about {}"

_model = "MoritzLaurer/xtremedistil-l6-h256-zeroshot-v1.1-all-33"
_revision = "dea69e79cd6063916d08b883ea8a3c1823fd10b4"


def _download_assets():
ORTModelForSequenceClassification.from_pretrained(
_model,
subfolder="onnx",
file_name="model.onnx",
revision=_revision,
export=False,
)
AutoTokenizer.from_pretrained(_model, revision=_revision)


def _get_tokenizer() -> PreTrainedTokenizerBase:
return AutoTokenizer.from_pretrained(_model, revision=_revision, local_files_only=True)


def _get_model() -> PreTrainedModel:
# return ORTModelForSequenceClassification.from_pretrained(
# _model,
# subfolder="onnx",
# file_name="model.onnx",
# export=False,
# revision=_revision,
# local_files_only=True,
# )
# Optimum doesn't support offline mode: https://github.com/huggingface/optimum/issues/1796
# The workaround for now is to reference the cached model path directly. Uncomment the code above once the issue is resolved.

model_name = _model.replace("/", "--")
home_dir = os.path.expanduser("~")
base = os.environ.get("HF_HOME", os.path.join(home_dir, ".cache/huggingface"))
model_path = f"{base}/hub/models--{model_name}/snapshots/{_revision}"
return ORTModelForSequenceClassification.from_pretrained(
model_path,
file_name="onnx/model.onnx",
export=False,
revision=_revision,
local_files_only=True,
)


@lru_cache
def _get_classifier() -> Pipeline:
return pipeline(
"zero-shot-classification",
model=_get_model(), # pyright: ignore[reportArgumentType]
tokenizer=_get_tokenizer(), # pyright: ignore[reportArgumentType]
truncation=True,
device="cuda" if torch.cuda.is_available() else "cpu",
)


class ClassificationResults(TypedDict):
sequence: str
labels: List[str]
scores: List[float]


def __get_scores_per_label(
text: List[str], topics: List[str], hypothesis_template: str = _hypothesis_template, multi_label: bool = True
) -> List[ClassificationResults]:
if not text:
return []

classifier = _get_classifier()
result: List[ClassificationResults] = classifier(text, topics, hypothesis_template=hypothesis_template, multi_label=multi_label) # type: ignore
return result


def _sanitize_metric_name(topic: str) -> str:
"""
Sanitize a metric name created from a topic: replace spaces with underscores and lowercase the result.
"""
return topic.replace(" ", "_").lower()


def topic_metric(input_name: str, topics: List[str], hypothesis_template: Optional[str] = None) -> MultiMetric:
hypothesis_template = hypothesis_template or _hypothesis_template

def udf(text: pd.DataFrame) -> MultiMetricResult:
value: List[str] = list(UdfInput(text).iter_column_rows(input_name))
results = __get_scores_per_label(value, topics=topics, hypothesis_template=hypothesis_template)

all_metrics: List[List[float]] = [[] for _ in topics]
for result in results:
# Map each topic to its score in the current result
topic_to_score = {label: score for label, score in zip(result["labels"], result["scores"])}
# For each topic, append the score to the corresponding list in all_metrics
for i, topic in enumerate(topics):
all_metrics[i].append(topic_to_score[topic])  # Append this row's score for topic i

return MultiMetricResult(metrics=all_metrics)

def cache_assets():
_download_assets()

def init():
_get_classifier()

metric_names = [f"{input_name}.topics.{_sanitize_metric_name(topic)}" for topic in topics]
return MultiMetric(names=metric_names, input_names=[input_name], evaluate=udf, cache_assets=cache_assets, init=init)


prompt_topic_module = partial(topic_metric, "prompt", __default_topics, _hypothesis_template)
response_topic_module = partial(topic_metric, "response", __default_topics, _hypothesis_template)
prompt_response_topic_module = [prompt_topic_module, response_topic_module]


@dataclass
class CustomTopicModules:
prompt_topic_module: MetricCreator
response_topic_module: MetricCreator
prompt_response_topic_module: MetricCreator


def get_custom_topic_modules(topics: List[str], template: str = _hypothesis_template) -> CustomTopicModules:
prompt_topic_module = partial(topic_metric, "prompt", topics, template)
response_topic_module = partial(topic_metric, "response", topics, template)
return CustomTopicModules(
prompt_topic_module=prompt_topic_module,
response_topic_module=response_topic_module,
prompt_response_topic_module=[prompt_topic_module, response_topic_module],
)
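A short sketch of how the custom topic helpers above might be used; how the resulting creators are registered downstream is outside this diff and assumed.

from langkit.metrics.topic_onnx import get_custom_topic_modules

modules = get_custom_topic_modules(["finance", "legal"])

# Each field is a MetricCreator; the combined module covers both columns.
prompt_only = modules.prompt_topic_module
both_columns = modules.prompt_response_topic_module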
81 changes: 81 additions & 0 deletions langkit/metrics/toxicity_onnx.py
@@ -0,0 +1,81 @@
# pyright: reportUnknownMemberType=none
# pyright: reportUnknownVariableType=none
# pyright: reportUnknownLambdaType=none
import os
from functools import lru_cache, partial
from typing import List, cast

import numpy as np
import onnxruntime
import pandas as pd
from transformers import (
AutoTokenizer,
PreTrainedTokenizerBase,
)

from langkit.asset_downloader import get_asset
from langkit.core.metric import Metric, SingleMetric, SingleMetricResult, UdfInput
from langkit.onnx_encoder import TransformerModel


def __toxicity(tokenizer: PreTrainedTokenizerBase, session: onnxruntime.InferenceSession, max_length: int, text: List[str]) -> List[float]:
max_length_in_chars = tokenizer.model_max_length * 5
truncated_text = [content[:max_length_in_chars] for content in text]
inputs = tokenizer(truncated_text, return_tensors="pt", padding=True, truncation=True)
onnx_inputs = {k: v.numpy() for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}
onnx_output_logits = session.run(None, onnx_inputs)[0]

# Apply softmax to convert logits into probabilities
probabilities = np.exp(onnx_output_logits) / np.sum(np.exp(onnx_output_logits), axis=1, keepdims=True) # pyright: ignore[reportUnknownArgumentType]
labels = ["non-toxic", "toxic"]
# Find the index of the highest probability to determine the predicted label
predicted_label_idx = np.argmax(probabilities, axis=1)
predicted_labels: List[str] = [labels[idx] for idx in predicted_label_idx]
predicted_scores: List[float] = [prob[idx] for prob, idx in zip(probabilities, predicted_label_idx)]
results = [{"label": label, "score": score} for label, score in zip(predicted_labels, predicted_scores)]
return [result["score"] if result["label"] == "toxic" else 1.0 - result["score"] for result in results] # type: ignore


def _download_assets():
name, tag = TransformerModel.ToxicCommentModel.value
return get_asset(name, tag)


@lru_cache
def _get_tokenizer() -> PreTrainedTokenizerBase:
return AutoTokenizer.from_pretrained(_download_assets())


@lru_cache
def _get_session() -> onnxruntime.InferenceSession:
downloaded_path = _download_assets()
onnx_model_path = os.path.join(downloaded_path, "model.onnx")
print(f"Loading ONNX model from {onnx_model_path}")
return onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])


def toxicity_metric(column_name: str) -> Metric:
def cache_assets():
_download_assets()

def init():
_get_session()
_get_tokenizer()

def udf(text: pd.DataFrame) -> SingleMetricResult:
_tokenizer = _get_tokenizer()
_session = _get_session()

col = list(UdfInput(text).iter_column_rows(column_name))
max_length = cast(int, _tokenizer.model_max_length)
metrics = __toxicity(_tokenizer, _session, max_length, col)
return SingleMetricResult(metrics=metrics)

return SingleMetric(
name=f"{column_name}.toxicity.toxicity_score", input_names=[column_name], evaluate=udf, init=init, cache_assets=cache_assets
)


prompt_toxicity_metric = partial(toxicity_metric, "prompt")
response_toxicity_metric = partial(toxicity_metric, "response")
prompt_response_toxicity_module = [prompt_toxicity_metric, response_toxicity_metric]
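And a minimal sketch of building the ONNX toxicity metric for a single column; the harness that evaluates a SingleMetric is assumed, not shown here.

from langkit.metrics.toxicity_onnx import prompt_toxicity_metric

# Builds the "prompt.toxicity.toxicity_score" metric definition; the ONNX asset is
# downloaded in cache_assets() and the session/tokenizer are loaded lazily in init().
toxicity = prompt_toxicity_metric()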
(Diffs for the remaining 3 changed files are not shown in this view.)
