diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 2556f98..426909e 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.87
+current_version = 0.0.88
 tag = False
 parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))?
 serialize = 
diff --git a/langkit/core/workflow.py b/langkit/core/workflow.py
index f3d363b..67b8705 100644
--- a/langkit/core/workflow.py
+++ b/langkit/core/workflow.py
@@ -178,8 +178,7 @@ def run(self, data: Dict[str, str]) -> EvaluationResult:
 
     def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResult:
         start = time.perf_counter()
-        if not self._initialized:
-            self.init()
+        self.init()
 
         if not isinstance(data, pd.DataFrame):
             if not is_dict_with_strings(data):
diff --git a/langkit/metrics/library.py b/langkit/metrics/library.py
index e3e99b3..6788722 100644
--- a/langkit/metrics/library.py
+++ b/langkit/metrics/library.py
@@ -48,14 +48,14 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:
 
         - prompt.pii.*
         - prompt.stats.token_count
-        - prompt.text_stat.char_count
+        - prompt.stats.char_count
         - prompt.similarity.injection
         - prompt.similarity.jailbreak
 
         - response.pii.*
         - response.stats.token_count
-        - response.text_stat.char_count
-        - response.text_stat.reading_ease
+        - response.stats.char_count
+        - response.stats.reading_ease
         - response.sentiment.sentiment_score
         - response.toxicity.toxicity_score
         - response.similarity.refusal
@@ -64,7 +64,7 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:
         prompt_metrics = [
             lib.prompt.pii,
             lib.prompt.stats.token_count,
-            lib.prompt.text_stat.char_count,
+            lib.prompt.stats.char_count,
             lib.prompt.similarity.injection,
             lib.prompt.similarity.jailbreak,
         ]
@@ -72,8 +72,8 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:
         response_metrics = [
             lib.response.pii,
             lib.response.stats.token_count,
-            lib.response.text_stat.char_count,
-            lib.response.text_stat.reading_ease,
+            lib.response.stats.char_count,
+            lib.response.stats.reading_ease,
             lib.response.sentiment.sentiment_score,
             lib.response.toxicity.toxicity_score,
             lib.response.similarity.refusal,
@@ -117,11 +117,11 @@ def toxicity_score() -> MetricCreator:
 
             return prompt_toxicity_metric
 
-        class text_stat:
+        class stats:
             def __call__(self) -> MetricCreator:
                 from langkit.metrics.text_statistics import prompt_textstat_metric
 
-                return prompt_textstat_metric
+                return [lib.prompt.stats.token_count, prompt_textstat_metric]
 
             @staticmethod
             def char_count() -> MetricCreator:
@@ -171,10 +171,6 @@ def difficult_words() -> MetricCreator:
 
             return prompt_difficult_words_metric
 
-        class stats:
-            def __call__(self) -> MetricCreator:
-                return [lib.prompt.stats.token_count]
-
         @staticmethod
         def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator:
             """
@@ -303,11 +299,11 @@ def toxicity_score() -> MetricCreator:
 
             return response_toxicity_metric
 
-        class text_stat:
+        class stats:
             def __call__(self) -> MetricCreator:
                 from langkit.metrics.text_statistics import response_textstat_metric
 
-                return response_textstat_metric
+                return [lib.response.stats.token_count, response_textstat_metric]
 
             @staticmethod
             def char_count() -> MetricCreator:
@@ -357,10 +353,6 @@ def difficult_words() -> MetricCreator:
 
             return response_difficult_words_metric
 
-        class stats:
-            def __call__(self) -> MetricCreator:
-                return [lib.response.stats.token_count]
-
         @staticmethod
         def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator:
             """
diff --git a/langkit/metrics/text_statistics.py b/langkit/metrics/text_statistics.py
index 03b5ca8..b5229d0 100644
--- a/langkit/metrics/text_statistics.py
+++ b/langkit/metrics/text_statistics.py
@@ -14,7 +14,7 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         return SingleMetricResult(metrics)
 
     return SingleMetric(
-        name=f"{column_name}.text_stat.{stat}",
+        name=f"{column_name}.stats.{stat}",
         input_name=column_name,
         evaluate=udf,
     )
diff --git a/langkit/metrics/token.py b/langkit/metrics/token.py
index 939650f..a4ed229 100644
--- a/langkit/metrics/token.py
+++ b/langkit/metrics/token.py
@@ -12,6 +12,12 @@ def _get_encoder(encoding: str):
 
 
 def token_metric(column_name: str, encoding: str = "cl100k_base") -> Metric:
+    def cache_assets():
+        _get_encoder(encoding)
+
+    def init():
+        _get_encoder(encoding)
+
     def udf(text: pd.DataFrame) -> SingleMetricResult:
         encoder = _get_encoder(encoding)
         encoding_len = [len(encoder.encode(it)) for it in UdfInput(text).iter_column_rows(column_name)]
@@ -21,6 +27,8 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         name=f"{column_name}.stats.token_count",
         input_name=column_name,
         evaluate=udf,
+        init=init,
+        cache_assets=cache_assets,
     )
 
 
diff --git a/pyproject.toml b/pyproject.toml
index fb98c1c..fc138e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.87"
+version = "0.0.88"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai "]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
diff --git a/tests/langkit/callbacks/test_webhook.py b/tests/langkit/callbacks/test_webhook.py
index e9044d1..e6a399c 100644
--- a/tests/langkit/callbacks/test_webhook.py
+++ b/tests/langkit/callbacks/test_webhook.py
@@ -7,8 +7,8 @@
 
 def test_webhook_failures_dont_ruin_run():
     wf = EvaluationWorkflow(
-        metrics=[metric_lib.prompt.text_stat.char_count],
-        validators=[validator_lib.constraint("prompt.text_stat.char_count", upper_threshold=5)],
+        metrics=[metric_lib.prompt.stats.char_count],
+        validators=[validator_lib.constraint("prompt.stats.char_count", upper_threshold=5)],
         callbacks=[callback_lib.webhook.basic_validation_failure("https://foo.bar")],  # will fail, url doesn't exist
     )
 
@@ -17,10 +17,10 @@ def test_webhook_failures_dont_ruin_run():
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
            details="Value 10 is above threshold 5",
             value=10,
             upper_threshold=5,
         )
     ]
-    assert result.metrics["prompt.text_stat.char_count"][0] == 10
+    assert result.metrics["prompt.stats.char_count"][0] == 10
diff --git a/tests/langkit/metrics/test_library.py b/tests/langkit/metrics/test_library.py
index 3dc9f2a..e09b14d 100644
--- a/tests/langkit/metrics/test_library.py
+++ b/tests/langkit/metrics/test_library.py
@@ -19,7 +19,7 @@ def test_recommended():
         "prompt.pii.us_bank_number",
         "prompt.pii.redacted",
         "prompt.stats.token_count",
-        "prompt.text_stat.char_count",
+        "prompt.stats.char_count",
         "prompt.similarity.injection",
         "prompt.similarity.jailbreak",
         "response.pii.phone_number",
@@ -29,8 +29,8 @@ def test_recommended():
         "response.pii.us_bank_number",
         "response.pii.redacted",
         "response.stats.token_count",
-        "response.text_stat.char_count",
-        "response.text_stat.flesch_reading_ease",
+        "response.stats.char_count",
+        "response.stats.flesch_reading_ease",
         "response.sentiment.sentiment_score",
         "response.toxicity",
         "response.similarity.refusal",
diff --git a/tests/langkit/metrics/test_metric_library.py b/tests/langkit/metrics/test_metric_library.py
deleted file mode 100644
index c0f5516..0000000
--- a/tests/langkit/metrics/test_metric_library.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from langkit.core.workflow import EvaluationWorkflow
-from langkit.metrics.library import lib
-
-
-def test_recommended():
-    row = {"prompt": "Hi, how are you doing today?", "response": "I'm doing great, how about you?"}
-    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
-
-    actual = wf.run(row)
-
-    expected_columns = [
-        "prompt.pii.phone_number",
-        "prompt.pii.email_address",
-        "prompt.pii.credit_card",
-        "prompt.pii.us_ssn",
-        "prompt.pii.us_bank_number",
-        "prompt.pii.redacted",
-        "prompt.stats.token_count",
-        "prompt.text_stat.char_count",
-        "prompt.similarity.injection",
-        "prompt.similarity.jailbreak",
-        "response.pii.phone_number",
-        "response.pii.email_address",
-        "response.pii.credit_card",
-        "response.pii.us_ssn",
-        "response.pii.us_bank_number",
-        "response.pii.redacted",
-        "response.stats.token_count",
-        "response.text_stat.char_count",
-        "response.text_stat.flesch_reading_ease",
-        "response.sentiment.sentiment_score",
-        "response.toxicity",
-        "response.similarity.refusal",
-        "id",
-    ]
-
-    assert list(actual.metrics.columns) == expected_columns
diff --git a/tests/langkit/metrics/test_text_statistics.py b/tests/langkit/metrics/test_text_statistics.py
index 672abd5..cbd2492 100644
--- a/tests/langkit/metrics/test_text_statistics.py
+++ b/tests/langkit/metrics/test_text_statistics.py
@@ -90,34 +90,34 @@ def test_prompt_response_textstat_module():
 
     expected_columns = [
         "prompt",
-        "prompt.text_stat.char_count",
-        "prompt.text_stat.difficult_words",
-        "prompt.text_stat.flesch_kincaid_grade",
-        "prompt.text_stat.flesch_reading_ease",
-        "prompt.text_stat.letter_count",
-        "prompt.text_stat.lexicon_count",
-        "prompt.text_stat.sentence_count",
-        "prompt.text_stat.syllable_count",
+        "prompt.stats.char_count",
+        "prompt.stats.difficult_words",
+        "prompt.stats.flesch_kincaid_grade",
+        "prompt.stats.flesch_reading_ease",
+        "prompt.stats.letter_count",
+        "prompt.stats.lexicon_count",
+        "prompt.stats.sentence_count",
+        "prompt.stats.syllable_count",
         "response",
-        "response.text_stat.char_count",
-        "response.text_stat.difficult_words",
-        "response.text_stat.flesch_kincaid_grade",
-        "response.text_stat.flesch_reading_ease",
-        "response.text_stat.letter_count",
-        "response.text_stat.lexicon_count",
-        "response.text_stat.sentence_count",
-        "response.text_stat.syllable_count",
+        "response.stats.char_count",
+        "response.stats.difficult_words",
+        "response.stats.flesch_kincaid_grade",
+        "response.stats.flesch_reading_ease",
+        "response.stats.letter_count",
+        "response.stats.lexicon_count",
+        "response.stats.sentence_count",
+        "response.stats.syllable_count",
     ]
 
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
     actual_row = _log(row, all_textstat_schema)
     assert actual_row.index.tolist() == expected_columns
-    assert actual_row["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert actual_row["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert actual_row["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert actual_row["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
 
 def test_prompt_textstat_module():
@@ -129,26 +129,26 @@
 
     expected_columns = [
         "prompt",
-        "prompt.text_stat.char_count",
-        "prompt.text_stat.difficult_words",
-        "prompt.text_stat.flesch_kincaid_grade",
-        "prompt.text_stat.flesch_reading_ease",
-        "prompt.text_stat.letter_count",
-        "prompt.text_stat.lexicon_count",
-        "prompt.text_stat.sentence_count",
-        "prompt.text_stat.syllable_count",
+        "prompt.stats.char_count",
+        "prompt.stats.difficult_words",
+        "prompt.stats.flesch_kincaid_grade",
+        "prompt.stats.flesch_reading_ease",
+        "prompt.stats.letter_count",
+        "prompt.stats.lexicon_count",
+        "prompt.stats.sentence_count",
+        "prompt.stats.syllable_count",
         "response",
     ]
 
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert "response.text_stat.char_count" not in actual["distribution/max"]
+    assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert "response.stats.char_count" not in actual["distribution/max"]
 
     actual_row = _log(row, prompt_textstat_schema)
     assert actual_row.index.tolist() == expected_columns
-    assert actual_row["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert "response.text_stat.char_count" not in actual_row["distribution/max"]
+    assert actual_row["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert "response.stats.char_count" not in actual_row["distribution/max"]
 
 
 def test_response_textstat_module():
@@ -161,25 +161,25 @@
     expected_columns = [
         "prompt",
         "response",
-        "response.text_stat.char_count",
-        "response.text_stat.difficult_words",
-        "response.text_stat.flesch_kincaid_grade",
-        "response.text_stat.flesch_reading_ease",
-        "response.text_stat.letter_count",
-        "response.text_stat.lexicon_count",
-        "response.text_stat.sentence_count",
-        "response.text_stat.syllable_count",
+        "response.stats.char_count",
+        "response.stats.difficult_words",
+        "response.stats.flesch_kincaid_grade",
+        "response.stats.flesch_reading_ease",
+        "response.stats.letter_count",
+        "response.stats.lexicon_count",
+        "response.stats.sentence_count",
+        "response.stats.syllable_count",
     ]
 
     assert actual.index.tolist() == expected_columns
-    assert "prompt.text_stat.char_count" not in actual["distribution/max"]
-    assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert "prompt.stats.char_count" not in actual["distribution/max"]
+    assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
     actual_row = _log(row, response_textstat_schema)
     assert actual_row.index.tolist() == expected_columns
-    assert "prompt.text_stat.char_count" not in actual_row["distribution/max"]
-    assert actual_row["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert "prompt.stats.char_count" not in actual_row["distribution/max"]
+    assert actual_row["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
 
 def test_prompt_reading_ease_module():
@@ -194,7 +194,7 @@
 
     assert actual.index.tolist() == [
         "prompt",
-        "prompt.text_stat.flesch_reading_ease",
+        "prompt.stats.flesch_reading_ease",
         "response",
     ]
 
@@ -212,7 +212,7 @@
     assert actual.index.tolist() == [
         "prompt",
         "response",
-        "response.text_stat.flesch_reading_ease",
+        "response.stats.flesch_reading_ease",
     ]
 
 
@@ -228,9 +228,9 @@
 
     assert actual.index.tolist() == [
         "prompt",
-        "prompt.text_stat.flesch_kincaid_grade",
+        "prompt.stats.flesch_kincaid_grade",
         "response",
-        "response.text_stat.flesch_kincaid_grade",
+        "response.stats.flesch_kincaid_grade",
     ]
 
 
@@ -243,7 +243,7 @@
 
     assert actual.index.tolist() == [
         "prompt",
-        "prompt.text_stat.char_count",
+        "prompt.stats.char_count",
        "response",
     ]
 
@@ -251,7 +251,7 @@
 def test_prompt_char_count_0_module():
     wf = EvaluationWorkflow(
         metrics=[prompt_char_count_metric, response_char_count_metric],
-        validators=[ConstraintValidator("prompt.text_stat.char_count", lower_threshold=2)],
+        validators=[ConstraintValidator("prompt.stats.char_count", lower_threshold=2)],
     )
 
     df = pd.DataFrame(
@@ -267,18 +267,18 @@
     actual = wf.run(df)
 
     assert actual.metrics.columns.tolist() == [
-        "prompt.text_stat.char_count",
-        "response.text_stat.char_count",
+        "prompt.stats.char_count",
+        "response.stats.char_count",
         "id",
     ]
 
     print(actual.metrics.transpose())
-    assert actual.metrics["prompt.text_stat.char_count"][0] == 0
+    assert actual.metrics["prompt.stats.char_count"][0] == 0
     assert actual.validation_results == ValidationResult(
         report=[
             ValidationFailure(
                 id="0",
-                metric="prompt.text_stat.char_count",
+                metric="prompt.stats.char_count",
                 details="Value 0 is below threshold 2",
                 value=0,
                 upper_threshold=None,
@@ -289,7 +289,7 @@
 
 
 def test_text_stat_group():
-    wf = EvaluationWorkflow(metrics=[lib.prompt.text_stat()])
+    wf = EvaluationWorkflow(metrics=[lib.prompt.stats()])
     df = pd.DataFrame(
         {
             "prompt": [
@@ -306,26 +306,28 @@
     assert sorted(actual.metrics.columns.tolist()) == sorted(  # pyright: ignore[reportUnknownArgumentType]
         [
             "id",
-            "prompt.text_stat.char_count",
-            "prompt.text_stat.difficult_words",
-            "prompt.text_stat.flesch_kincaid_grade",
-            "prompt.text_stat.flesch_reading_ease",
-            "prompt.text_stat.letter_count",
-            "prompt.text_stat.lexicon_count",
-            "prompt.text_stat.sentence_count",
-            "prompt.text_stat.syllable_count",
+            "prompt.stats.char_count",
+            "prompt.stats.difficult_words",
+            "prompt.stats.flesch_kincaid_grade",
+            "prompt.stats.flesch_reading_ease",
+            "prompt.stats.letter_count",
+            "prompt.stats.lexicon_count",
+            "prompt.stats.sentence_count",
+            "prompt.stats.syllable_count",
+            "prompt.stats.token_count",
         ]
     )
 
     print(actual.metrics.transpose())
-    assert actual.metrics["prompt.text_stat.char_count"][0] == 4
-    assert actual.metrics["prompt.text_stat.difficult_words"][0] == 0
-    assert actual.metrics["prompt.text_stat.flesch_kincaid_grade"][0] == -3.5
-    assert actual.metrics["prompt.text_stat.flesch_reading_ease"][0] == 121.22
-    assert actual.metrics["prompt.text_stat.letter_count"][0] == 4
-    assert actual.metrics["prompt.text_stat.lexicon_count"][0] == 1
-    assert actual.metrics["prompt.text_stat.sentence_count"][0] == 1
-    assert actual.metrics["prompt.text_stat.syllable_count"][0] == 1
+    assert actual.metrics["prompt.stats.char_count"][0] == 4
+    assert actual.metrics["prompt.stats.difficult_words"][0] == 0
+    assert actual.metrics["prompt.stats.flesch_kincaid_grade"][0] == -3.5
+    assert actual.metrics["prompt.stats.flesch_reading_ease"][0] == 121.22
+    assert actual.metrics["prompt.stats.letter_count"][0] == 4
+    assert actual.metrics["prompt.stats.lexicon_count"][0] == 1
+    assert actual.metrics["prompt.stats.sentence_count"][0] == 1
+    assert actual.metrics["prompt.stats.syllable_count"][0] == 1
+    assert actual.metrics["prompt.stats.token_count"][0] == 1
 
 
 def test_response_char_count_module():
@@ -338,7 +340,7 @@
     assert actual.index.tolist() == [
         "prompt",
         "response",
-        "response.text_stat.char_count",
+        "response.stats.char_count",
     ]
 
 
@@ -365,18 +367,18 @@
 
     expected_columns = [
         "prompt",
-        "prompt.text_stat.char_count",
-        "prompt.text_stat.difficult_words",
-        "prompt.text_stat.flesch_reading_ease",
+        "prompt.stats.char_count",
+        "prompt.stats.difficult_words",
+        "prompt.stats.flesch_reading_ease",
         "response",
-        "response.text_stat.char_count",
-        "response.text_stat.sentence_count",
+        "response.stats.char_count",
+        "response.stats.sentence_count",
     ]
 
     assert list(actual.columns) == expected_metrics
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
     # and you get the same results if you combine the modules in different ways
@@ -397,8 +399,8 @@
 
     assert list(actual.columns) == expected_metrics
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
 
 def test_multi_text_stat_metric():
@@ -444,12 +446,12 @@ def udf(text: pd.DataFrame) -> MultiMetricResult:
         "prompt",
         "prompt.custom_textstat1",
         "prompt.custom_textstat2",
-        "prompt.text_stat.char_count",
+        "prompt.stats.char_count",
         "response",
     ]
 
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == 28
+    assert actual["distribution/max"]["prompt.stats.char_count"] == 28
     assert actual["distribution/min"]["prompt.custom_textstat1"] == 2
     assert actual["distribution/max"]["prompt.custom_textstat1"] == 26
     assert actual["distribution/min"]["prompt.custom_textstat2"] == 4
diff --git a/tests/langkit/validators/test_comparison.py b/tests/langkit/validators/test_comparison.py
index 6cc20b1..de22c59 100644
--- a/tests/langkit/validators/test_comparison.py
+++ b/tests/langkit/validators/test_comparison.py
@@ -8,19 +8,19 @@
 
 def test_one_required():
     with pytest.raises(Exception):
-        ConstraintValidator("prompt.text_stat.char_count")
+        ConstraintValidator("prompt.stats.char_count")
 
 
 def test_upper_threshold():
-    validator = ConstraintValidator("prompt.text_stat.char_count", upper_threshold=5)
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", upper_threshold=5)
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "1234567890"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 10 is above threshold 5",
             value=10,
             upper_threshold=5,
@@ -29,15 +29,15 @@
 
 
 def test_lower_threshold():
-    validator = ConstraintValidator("prompt.text_stat.char_count", lower_threshold=5)
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", lower_threshold=5)
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "1"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 1 is below threshold 5",
             value=1,
             lower_threshold=5,
@@ -46,15 +46,15 @@
 
 
 def test_upper_threshold_inclusive():
-    validator = ConstraintValidator("prompt.text_stat.char_count", upper_threshold_inclusive=5)
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", upper_threshold_inclusive=5)
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "12345"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 5 is above or equal to threshold 5",
             value=5,
             upper_threshold=5,
@@ -63,15 +63,15 @@
 
 
 def test_lower_threshold_inclusive():
-    validator = ConstraintValidator("prompt.text_stat.char_count", lower_threshold_inclusive=5)
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", lower_threshold_inclusive=5)
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "12345"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 5 is below or equal to threshold 5",
             value=5,
             lower_threshold=5,
@@ -80,15 +80,15 @@
 
 
 def test_one_of():
-    validator = ConstraintValidator("prompt.text_stat.char_count", one_of=[1, 2, 3])
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", one_of=[1, 2, 3])
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "asdf"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 4 is not in allowed values {1, 2, 3}",
             value=4,
             allowed_values=[1, 2, 3],
@@ -97,15 +97,15 @@
 
 
 def test_none_of():
-    validator = ConstraintValidator("prompt.text_stat.char_count", none_of=[1, 2, 3])
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", none_of=[1, 2, 3])
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "asd"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 3 is in disallowed values {1, 2, 3}",
             value=3,
             disallowed_values=[1, 2, 3],
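Usage sketch (not part of the patch): a minimal example of the renamed metric namespace, assembled only from the EvaluationWorkflow and lib calls exercised in the tests above; the example strings are the same ones used in the deleted test.

    from langkit.core.workflow import EvaluationWorkflow
    from langkit.metrics.library import lib

    # Metric names now use the "prompt.stats.*" / "response.stats.*" prefix
    # instead of "prompt.text_stat.*" / "response.text_stat.*".
    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])

    # run() accepts a dict of strings (as in the tests) or a pandas DataFrame.
    result = wf.run({"prompt": "Hi, how are you doing today?", "response": "I'm doing great, how about you?"})

    # Columns asserted on by the updated tests.
    print(result.metrics["prompt.stats.char_count"][0])
    print(result.metrics["response.stats.flesch_reading_ease"][0])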