Commit

Merge pull request #264 from whylabs/separate
Separate
naddeoa authored Mar 13, 2024
2 parents 312a5b9 + 05305e6 commit 6a05144
Showing 4 changed files with 57 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.91
+current_version = 0.0.92
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize =
5 changes: 5 additions & 0 deletions langkit/core/workflow.py
@@ -194,6 +194,11 @@ def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResult
         metric_times: List[Tuple[str, float]] = []

         for metric in self.metrics.metrics:
+            # check that the dataframe has the metric.input_name present, or else skip
+            if metric.input_name not in df.columns:
+                logger.debug(f"Skipping metric {metric} because {metric.input_name} is not present in the input dataframe")
+                continue
+
             metric_start = time.perf_counter()
             if isinstance(metric, SingleMetric):
                 result = metric.evaluate(df)
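
The guard added above makes the workflow skip any metric whose required input column is missing from the dataframe, instead of failing. Below is a minimal standalone sketch of that pattern, assuming a hypothetical Metric container with an input_name and an evaluate callable; it is not LangKit's actual metric type, only an illustration of the skip logic.

# Standalone sketch of the skip-if-column-missing pattern (hypothetical Metric type).
import logging
from dataclasses import dataclass
from typing import Callable, Dict, List

import pandas as pd

logger = logging.getLogger(__name__)


@dataclass
class Metric:
    input_name: str                                # column the metric needs
    evaluate: Callable[[pd.DataFrame], pd.Series]  # computes the metric values


def run_metrics(df: pd.DataFrame, metrics: List[Metric]) -> pd.DataFrame:
    results: Dict[str, pd.Series] = {}
    for metric in metrics:
        # Mirror the check added to EvaluationWorkflow.run: if the input
        # column is absent, log and move on rather than raising a KeyError.
        if metric.input_name not in df.columns:
            logger.debug("Skipping metric %s because %s is not present", metric, metric.input_name)
            continue
        results[f"{metric.input_name}.char_count"] = metric.evaluate(df)
    return pd.DataFrame(results)


# With only a "prompt" column, the response metric is skipped.
df = pd.DataFrame({"prompt": ["hi"]})
out = run_metrics(
    df,
    [
        Metric("prompt", lambda d: d["prompt"].str.len()),
        Metric("response", lambda d: d["response"].str.len()),
    ],
)
print(out.columns.tolist())  # ['prompt.char_count']
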
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.91"
+version = "0.0.92"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai <[email protected]>"]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
50 changes: 50 additions & 0 deletions tests/langkit/metrics/test_workflow.py
@@ -0,0 +1,50 @@
+from typing import List
+
+from langkit.core.workflow import EvaluationWorkflow
+from langkit.metrics.library import lib
+
+
+def test_just_prompt():
+    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
+    result = wf.run({"prompt": "hi"})
+    metrics = result.metrics
+
+    metric_names: List[str] = metrics.columns.tolist()  # pyright: ignore[reportUnknownMemberType]
+
+    assert metric_names == [
+        "prompt.pii.phone_number",
+        "prompt.pii.email_address",
+        "prompt.pii.credit_card",
+        "prompt.pii.us_ssn",
+        "prompt.pii.us_bank_number",
+        "prompt.pii.redacted",
+        "prompt.stats.token_count",
+        "prompt.stats.char_count",
+        "prompt.similarity.injection",
+        "prompt.similarity.jailbreak",
+        "id",
+    ]
+
+
+def test_just_response():
+    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
+    result = wf.run({"response": "I'm doing great!"})
+    metrics = result.metrics
+
+    metric_names: List[str] = metrics.columns.tolist()  # pyright: ignore[reportUnknownMemberType]
+
+    assert metric_names == [
+        "response.pii.phone_number",
+        "response.pii.email_address",
+        "response.pii.credit_card",
+        "response.pii.us_ssn",
+        "response.pii.us_bank_number",
+        "response.pii.redacted",
+        "response.stats.token_count",
+        "response.stats.char_count",
+        "response.stats.flesch_reading_ease",
+        "response.sentiment.sentiment_score",
+        "response.toxicity.toxicity_score",
+        "response.similarity.refusal",
+        "id",
+    ]
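
The two tests above exercise the new skip behavior end to end: with only a prompt (or only a response) in the input, the metrics for the missing column are dropped instead of erroring. Here is a small usage sketch along the same lines; the exact columns produced when both fields are supplied are not listed in this change, so the final printout is illustrative rather than authoritative.

import logging

from langkit.core.workflow import EvaluationWorkflow
from langkit.metrics.library import lib

# Surface the "Skipping metric ..." debug message emitted by workflow.run
# whenever a metric's input column is absent from the dataframe.
logging.basicConfig(level=logging.DEBUG)

wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])

# Prompt-only input: response.* metrics are skipped, as in test_just_prompt.
prompt_only = wf.run({"prompt": "hi"})
print(prompt_only.metrics.columns.tolist())

# With both fields supplied, both prompt.* and response.* metrics should run
# (an assumption based on the prompt-only and response-only tests above).
both = wf.run({"prompt": "hi", "response": "I'm doing great!"})
print(both.metrics.columns.tolist())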
