From 7f01ff72f8abc500e0fd0eca6df9aa5c86675fe1 Mon Sep 17 00:00:00 2001
From: Anthony Naddeo
Date: Tue, 12 Mar 2024 19:46:01 -0700
Subject: [PATCH 1/3] Enable profiling prompt/response separately.

---
 langkit/core/workflow.py               |  5 +++
 tests/langkit/metrics/test_workflow.py | 50 ++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 tests/langkit/metrics/test_workflow.py

diff --git a/langkit/core/workflow.py b/langkit/core/workflow.py
index 22f007c..6fa3bd5 100644
--- a/langkit/core/workflow.py
+++ b/langkit/core/workflow.py
@@ -194,6 +194,11 @@ def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResul
         metric_times: List[Tuple[str, float]] = []
 
         for metric in self.metrics.metrics:
+            # check that the dataframe has the metric.input_name present, or else skip
+            if metric.input_name not in df.columns:
+                logger.warning(f"Skipping metric {metric} because {metric.input_name} is not present in the input dataframe")
+                continue
+
             metric_start = time.perf_counter()
             if isinstance(metric, SingleMetric):
                 result = metric.evaluate(df)
diff --git a/tests/langkit/metrics/test_workflow.py b/tests/langkit/metrics/test_workflow.py
new file mode 100644
index 0000000..8fe5d90
--- /dev/null
+++ b/tests/langkit/metrics/test_workflow.py
@@ -0,0 +1,50 @@
+from typing import List
+
+from langkit.core.workflow import EvaluationWorkflow
+from langkit.metrics.library import lib
+
+
+def test_just_prompt():
+    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
+    result = wf.run({"prompt": "hi"})
+    metrics = result.metrics
+
+    metric_names: List[str] = metrics.columns.tolist()  # pyright: ignore[reportUnknownMemberType]
+
+    assert metric_names == [
+        "prompt.pii.phone_number",
+        "prompt.pii.email_address",
+        "prompt.pii.credit_card",
+        "prompt.pii.us_ssn",
+        "prompt.pii.us_bank_number",
+        "prompt.pii.redacted",
+        "prompt.stats.token_count",
+        "prompt.stats.char_count",
+        "prompt.similarity.injection",
+        "prompt.similarity.jailbreak",
+        "id",
+    ]
+
+
+def test_just_response():
+    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
+    result = wf.run({"response": "I'm doing great!"})
+    metrics = result.metrics
+
+    metric_names: List[str] = metrics.columns.tolist()  # pyright: ignore[reportUnknownMemberType]
+
+    assert metric_names == [
+        "response.pii.phone_number",
+        "response.pii.email_address",
+        "response.pii.credit_card",
+        "response.pii.us_ssn",
+        "response.pii.us_bank_number",
+        "response.pii.redacted",
+        "response.stats.token_count",
+        "response.stats.char_count",
+        "response.stats.flesch_reading_ease",
+        "response.sentiment.sentiment_score",
+        "response.toxicity.toxicity_score",
+        "response.similarity.refusal",
+        "id",
+    ]

From 039102d4f98bd7b9bf78fdb1659ad1fe47f76d9d Mon Sep 17 00:00:00 2001
From: Anthony Naddeo
Date: Tue, 12 Mar 2024 19:46:37 -0700
Subject: [PATCH 2/3] bumping for release

---
 .bumpversion.cfg | 2 +-
 pyproject.toml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 341af5d..49bb7a8 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.91
+current_version = 0.0.92
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize =
diff --git a/pyproject.toml b/pyproject.toml
index 236f01e..d712d2e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.91"
+version = "0.0.92"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai "]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"

From 05305e6b4320e9482d450402b13f996d4219b744 Mon Sep 17 00:00:00 2001
From: Anthony Naddeo
Date: Tue, 12 Mar 2024 19:51:11 -0700
Subject: [PATCH 3/3] Reduce log message level for skipped metrics

Too verbose at warning for what will be a regular occurrence.
---
 langkit/core/workflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/langkit/core/workflow.py b/langkit/core/workflow.py
index 6fa3bd5..0aa91a2 100644
--- a/langkit/core/workflow.py
+++ b/langkit/core/workflow.py
@@ -196,7 +196,7 @@ def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResul
         for metric in self.metrics.metrics:
             # check that the dataframe has the metric.input_name present, or else skip
             if metric.input_name not in df.columns:
-                logger.warning(f"Skipping metric {metric} because {metric.input_name} is not present in the input dataframe")
+                logger.debug(f"Skipping metric {metric} because {metric.input_name} is not present in the input dataframe")
                 continue
 
             metric_start = time.perf_counter()
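
Below is a minimal usage sketch (not part of the patch series) of the behavior patch 1/3 enables. It mirrors the test file added above; the EvaluationWorkflow and lib.presets.recommended() calls are taken directly from that test, and the exact metric columns may differ across langkit versions.

# Illustrative sketch only; mirrors tests/langkit/metrics/test_workflow.py above.
from langkit.core.workflow import EvaluationWorkflow
from langkit.metrics.library import lib

wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])

# Prompt-only input: response.* metrics are skipped (logged at debug level
# after patch 3/3) instead of failing the run.
prompt_result = wf.run({"prompt": "hi"})
print(prompt_result.metrics.columns.tolist())  # prompt.* columns plus "id"

# Response-only input: prompt.* metrics are skipped the same way.
response_result = wf.run({"response": "I'm doing great!"})
print(response_result.metrics.columns.tolist())  # response.* columns plus "id"

Skipping rather than raising keeps the single recommended preset usable for prompt-only and response-only rows alike, which is what the two tests above assert.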