diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 341af5d..49bb7a8 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.91
+current_version = 0.0.92
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize =
diff --git a/langkit/core/workflow.py b/langkit/core/workflow.py
index 22f007c..0aa91a2 100644
--- a/langkit/core/workflow.py
+++ b/langkit/core/workflow.py
@@ -194,6 +194,11 @@ def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResult
 
         metric_times: List[Tuple[str, float]] = []
         for metric in self.metrics.metrics:
+            # check that the dataframe has the metric.input_name present, or else skip
+            if metric.input_name not in df.columns:
+                logger.debug(f"Skipping metric {metric} because {metric.input_name} is not present in the input dataframe")
+                continue
+
             metric_start = time.perf_counter()
             if isinstance(metric, SingleMetric):
                 result = metric.evaluate(df)
diff --git a/pyproject.toml b/pyproject.toml
index 236f01e..d712d2e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.91"
+version = "0.0.92"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai "]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
diff --git a/tests/langkit/metrics/test_workflow.py b/tests/langkit/metrics/test_workflow.py
new file mode 100644
index 0000000..8fe5d90
--- /dev/null
+++ b/tests/langkit/metrics/test_workflow.py
@@ -0,0 +1,50 @@
+from typing import List
+
+from langkit.core.workflow import EvaluationWorkflow
+from langkit.metrics.library import lib
+
+
+def test_just_prompt():
+    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
+    result = wf.run({"prompt": "hi"})
+    metrics = result.metrics
+
+    metric_names: List[str] = metrics.columns.tolist()  # pyright: ignore[reportUnknownMemberType]
+
+    assert metric_names == [
+        "prompt.pii.phone_number",
+        "prompt.pii.email_address",
+        "prompt.pii.credit_card",
+        "prompt.pii.us_ssn",
+        "prompt.pii.us_bank_number",
+        "prompt.pii.redacted",
+        "prompt.stats.token_count",
+        "prompt.stats.char_count",
+        "prompt.similarity.injection",
+        "prompt.similarity.jailbreak",
+        "id",
+    ]
+
+
+def test_just_response():
+    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
+    result = wf.run({"response": "I'm doing great!"})
+    metrics = result.metrics
+
+    metric_names: List[str] = metrics.columns.tolist()  # pyright: ignore[reportUnknownMemberType]
+
+    assert metric_names == [
+        "response.pii.phone_number",
+        "response.pii.email_address",
+        "response.pii.credit_card",
+        "response.pii.us_ssn",
+        "response.pii.us_bank_number",
+        "response.pii.redacted",
+        "response.stats.token_count",
+        "response.stats.char_count",
+        "response.stats.flesch_reading_ease",
+        "response.sentiment.sentiment_score",
+        "response.toxicity.toxicity_score",
+        "response.similarity.refusal",
+        "id",
+    ]