From ec9268945997c5c963be0e483ca11d0f66071b8d Mon Sep 17 00:00:00 2001
From: Anthony Naddeo
Date: Thu, 7 Mar 2024 21:18:17 -0800
Subject: [PATCH 1/3] Add init method to token metric

---
 langkit/core/workflow.py | 3 +--
 langkit/metrics/token.py | 8 ++++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/langkit/core/workflow.py b/langkit/core/workflow.py
index f3d363b..67b8705 100644
--- a/langkit/core/workflow.py
+++ b/langkit/core/workflow.py
@@ -178,8 +178,7 @@ def run(self, data: Dict[str, str]) -> EvaluationResult:
     def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResult:
         start = time.perf_counter()

-        if not self._initialized:
-            self.init()
+        self.init()

         if not isinstance(data, pd.DataFrame):
             if not is_dict_with_strings(data):
diff --git a/langkit/metrics/token.py b/langkit/metrics/token.py
index 939650f..a4ed229 100644
--- a/langkit/metrics/token.py
+++ b/langkit/metrics/token.py
@@ -12,6 +12,12 @@ def _get_encoder(encoding: str):


 def token_metric(column_name: str, encoding: str = "cl100k_base") -> Metric:
+    def cache_assets():
+        _get_encoder(encoding)
+
+    def init():
+        _get_encoder(encoding)
+
     def udf(text: pd.DataFrame) -> SingleMetricResult:
         encoder = _get_encoder(encoding)
         encoding_len = [len(encoder.encode(it)) for it in UdfInput(text).iter_column_rows(column_name)]
@@ -21,6 +27,8 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         name=f"{column_name}.stats.token_count",
         input_name=column_name,
         evaluate=udf,
+        init=init,
+        cache_assets=cache_assets,
     )


From 6ac0c3f55a876f584a3240473a7d92099a8a906b Mon Sep 17 00:00:00 2001
From: Anthony Naddeo
Date: Sat, 9 Mar 2024 09:43:40 -0800
Subject: [PATCH 2/3] Change the text_stat group to stats

Avoid using the name of the underlying library in the group names. Everything
from textstat is now part of stats, along with tiktoken's token count.
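For illustration only (a sketch, not part of this patch, using the EvaluationWorkflow
and lib APIs exercised in the tests below): metrics that used to be reported as
prompt.text_stat.* now come back under prompt.stats.*, and the stats group also
pulls in the tiktoken-based token count.

    from langkit.core.workflow import EvaluationWorkflow
    from langkit.metrics.library import lib

    # Request the whole stats group for the prompt column. After this change it
    # covers the former text_stat metrics plus prompt.stats.token_count.
    wf = EvaluationWorkflow(metrics=[lib.prompt.stats()])
    result = wf.run({"prompt": "Hi, how are you doing today?"})

    # Expect names like prompt.stats.char_count, prompt.stats.flesch_reading_ease,
    # and prompt.stats.token_count (plus the "id" column).
    print(sorted(result.metrics.columns.tolist()))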
--- langkit/metrics/library.py | 28 ++- langkit/metrics/text_statistics.py | 2 +- tests/langkit/callbacks/test_webhook.py | 8 +- tests/langkit/metrics/test_library.py | 6 +- tests/langkit/metrics/test_metric_library.py | 37 ---- tests/langkit/metrics/test_text_statistics.py | 168 +++++++++--------- tests/langkit/validators/test_comparison.py | 38 ++-- 7 files changed, 122 insertions(+), 165 deletions(-) delete mode 100644 tests/langkit/metrics/test_metric_library.py diff --git a/langkit/metrics/library.py b/langkit/metrics/library.py index e3e99b3..6788722 100644 --- a/langkit/metrics/library.py +++ b/langkit/metrics/library.py @@ -48,14 +48,14 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator: - prompt.pii.* - prompt.stats.token_count - - prompt.text_stat.char_count + - prompt.stats.char_count - prompt.similarity.injection - prompt.similarity.jailbreak - response.pii.* - response.stats.token_count - - response.text_stat.char_count - - response.text_stat.reading_ease + - response.stats.char_count + - response.stats.reading_ease - response.sentiment.sentiment_score - response.toxicity.toxicity_score - response.similarity.refusal @@ -64,7 +64,7 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator: prompt_metrics = [ lib.prompt.pii, lib.prompt.stats.token_count, - lib.prompt.text_stat.char_count, + lib.prompt.stats.char_count, lib.prompt.similarity.injection, lib.prompt.similarity.jailbreak, ] @@ -72,8 +72,8 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator: response_metrics = [ lib.response.pii, lib.response.stats.token_count, - lib.response.text_stat.char_count, - lib.response.text_stat.reading_ease, + lib.response.stats.char_count, + lib.response.stats.reading_ease, lib.response.sentiment.sentiment_score, lib.response.toxicity.toxicity_score, lib.response.similarity.refusal, @@ -117,11 +117,11 @@ def toxicity_score() -> MetricCreator: return prompt_toxicity_metric - class text_stat: + class stats: def __call__(self) -> MetricCreator: from langkit.metrics.text_statistics import prompt_textstat_metric - return prompt_textstat_metric + return [lib.prompt.stats.token_count, prompt_textstat_metric] @staticmethod def char_count() -> MetricCreator: @@ -171,10 +171,6 @@ def difficult_words() -> MetricCreator: return prompt_difficult_words_metric - class stats: - def __call__(self) -> MetricCreator: - return [lib.prompt.stats.token_count] - @staticmethod def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator: """ @@ -303,11 +299,11 @@ def toxicity_score() -> MetricCreator: return response_toxicity_metric - class text_stat: + class stats: def __call__(self) -> MetricCreator: from langkit.metrics.text_statistics import response_textstat_metric - return response_textstat_metric + return [lib.response.stats.token_count, response_textstat_metric] @staticmethod def char_count() -> MetricCreator: @@ -357,10 +353,6 @@ def difficult_words() -> MetricCreator: return response_difficult_words_metric - class stats: - def __call__(self) -> MetricCreator: - return [lib.response.stats.token_count] - @staticmethod def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator: """ diff --git a/langkit/metrics/text_statistics.py b/langkit/metrics/text_statistics.py index 03b5ca8..b5229d0 100644 --- a/langkit/metrics/text_statistics.py +++ b/langkit/metrics/text_statistics.py @@ -14,7 +14,7 @@ def udf(text: pd.DataFrame) -> SingleMetricResult: return SingleMetricResult(metrics) return SingleMetric( - 
name=f"{column_name}.text_stat.{stat}", + name=f"{column_name}.stats.{stat}", input_name=column_name, evaluate=udf, ) diff --git a/tests/langkit/callbacks/test_webhook.py b/tests/langkit/callbacks/test_webhook.py index e9044d1..e6a399c 100644 --- a/tests/langkit/callbacks/test_webhook.py +++ b/tests/langkit/callbacks/test_webhook.py @@ -7,8 +7,8 @@ def test_webhook_failures_dont_ruin_run(): wf = EvaluationWorkflow( - metrics=[metric_lib.prompt.text_stat.char_count], - validators=[validator_lib.constraint("prompt.text_stat.char_count", upper_threshold=5)], + metrics=[metric_lib.prompt.stats.char_count], + validators=[validator_lib.constraint("prompt.stats.char_count", upper_threshold=5)], callbacks=[callback_lib.webhook.basic_validation_failure("https://foo.bar")], # will fail, url doesn't exist ) @@ -17,10 +17,10 @@ def test_webhook_failures_dont_ruin_run(): assert result.validation_results.report == [ ValidationFailure( id="0", - metric="prompt.text_stat.char_count", + metric="prompt.stats.char_count", details="Value 10 is above threshold 5", value=10, upper_threshold=5, ) ] - assert result.metrics["prompt.text_stat.char_count"][0] == 10 + assert result.metrics["prompt.stats.char_count"][0] == 10 diff --git a/tests/langkit/metrics/test_library.py b/tests/langkit/metrics/test_library.py index 3dc9f2a..e09b14d 100644 --- a/tests/langkit/metrics/test_library.py +++ b/tests/langkit/metrics/test_library.py @@ -19,7 +19,7 @@ def test_recommended(): "prompt.pii.us_bank_number", "prompt.pii.redacted", "prompt.stats.token_count", - "prompt.text_stat.char_count", + "prompt.stats.char_count", "prompt.similarity.injection", "prompt.similarity.jailbreak", "response.pii.phone_number", @@ -29,8 +29,8 @@ def test_recommended(): "response.pii.us_bank_number", "response.pii.redacted", "response.stats.token_count", - "response.text_stat.char_count", - "response.text_stat.flesch_reading_ease", + "response.stats.char_count", + "response.stats.flesch_reading_ease", "response.sentiment.sentiment_score", "response.toxicity", "response.similarity.refusal", diff --git a/tests/langkit/metrics/test_metric_library.py b/tests/langkit/metrics/test_metric_library.py deleted file mode 100644 index c0f5516..0000000 --- a/tests/langkit/metrics/test_metric_library.py +++ /dev/null @@ -1,37 +0,0 @@ -from langkit.core.workflow import EvaluationWorkflow -from langkit.metrics.library import lib - - -def test_recommended(): - row = {"prompt": "Hi, how are you doing today?", "response": "I'm doing great, how about you?"} - wf = EvaluationWorkflow(metrics=[lib.presets.recommended()]) - - actual = wf.run(row) - - expected_columns = [ - "prompt.pii.phone_number", - "prompt.pii.email_address", - "prompt.pii.credit_card", - "prompt.pii.us_ssn", - "prompt.pii.us_bank_number", - "prompt.pii.redacted", - "prompt.stats.token_count", - "prompt.text_stat.char_count", - "prompt.similarity.injection", - "prompt.similarity.jailbreak", - "response.pii.phone_number", - "response.pii.email_address", - "response.pii.credit_card", - "response.pii.us_ssn", - "response.pii.us_bank_number", - "response.pii.redacted", - "response.stats.token_count", - "response.text_stat.char_count", - "response.text_stat.flesch_reading_ease", - "response.sentiment.sentiment_score", - "response.toxicity", - "response.similarity.refusal", - "id", - ] - - assert list(actual.metrics.columns) == expected_columns diff --git a/tests/langkit/metrics/test_text_statistics.py b/tests/langkit/metrics/test_text_statistics.py index 672abd5..cbd2492 100644 --- 
a/tests/langkit/metrics/test_text_statistics.py +++ b/tests/langkit/metrics/test_text_statistics.py @@ -90,34 +90,34 @@ def test_prompt_response_textstat_module(): expected_columns = [ "prompt", - "prompt.text_stat.char_count", - "prompt.text_stat.difficult_words", - "prompt.text_stat.flesch_kincaid_grade", - "prompt.text_stat.flesch_reading_ease", - "prompt.text_stat.letter_count", - "prompt.text_stat.lexicon_count", - "prompt.text_stat.sentence_count", - "prompt.text_stat.syllable_count", + "prompt.stats.char_count", + "prompt.stats.difficult_words", + "prompt.stats.flesch_kincaid_grade", + "prompt.stats.flesch_reading_ease", + "prompt.stats.letter_count", + "prompt.stats.lexicon_count", + "prompt.stats.sentence_count", + "prompt.stats.syllable_count", "response", - "response.text_stat.char_count", - "response.text_stat.difficult_words", - "response.text_stat.flesch_kincaid_grade", - "response.text_stat.flesch_reading_ease", - "response.text_stat.letter_count", - "response.text_stat.lexicon_count", - "response.text_stat.sentence_count", - "response.text_stat.syllable_count", + "response.stats.char_count", + "response.stats.difficult_words", + "response.stats.flesch_kincaid_grade", + "response.stats.flesch_reading_ease", + "response.stats.letter_count", + "response.stats.lexicon_count", + "response.stats.sentence_count", + "response.stats.syllable_count", ] assert actual.index.tolist() == expected_columns - assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", "")) - assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", "")) + assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", "")) + assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", "")) actual_row = _log(row, all_textstat_schema) assert actual_row.index.tolist() == expected_columns - assert actual_row["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", "")) - assert actual_row["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", "")) + assert actual_row["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", "")) + assert actual_row["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", "")) def test_prompt_textstat_module(): @@ -129,26 +129,26 @@ def test_prompt_textstat_module(): expected_columns = [ "prompt", - "prompt.text_stat.char_count", - "prompt.text_stat.difficult_words", - "prompt.text_stat.flesch_kincaid_grade", - "prompt.text_stat.flesch_reading_ease", - "prompt.text_stat.letter_count", - "prompt.text_stat.lexicon_count", - "prompt.text_stat.sentence_count", - "prompt.text_stat.syllable_count", + "prompt.stats.char_count", + "prompt.stats.difficult_words", + "prompt.stats.flesch_kincaid_grade", + "prompt.stats.flesch_reading_ease", + "prompt.stats.letter_count", + "prompt.stats.lexicon_count", + "prompt.stats.sentence_count", + "prompt.stats.syllable_count", "response", ] assert actual.index.tolist() == expected_columns - assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", "")) - assert "response.text_stat.char_count" not in actual["distribution/max"] + assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", "")) + assert "response.stats.char_count" not in actual["distribution/max"] actual_row = _log(row, prompt_textstat_schema) assert 
actual_row.index.tolist() == expected_columns - assert actual_row["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", "")) - assert "response.text_stat.char_count" not in actual_row["distribution/max"] + assert actual_row["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", "")) + assert "response.stats.char_count" not in actual_row["distribution/max"] def test_response_textstat_module(): @@ -161,25 +161,25 @@ def test_response_textstat_module(): expected_columns = [ "prompt", "response", - "response.text_stat.char_count", - "response.text_stat.difficult_words", - "response.text_stat.flesch_kincaid_grade", - "response.text_stat.flesch_reading_ease", - "response.text_stat.letter_count", - "response.text_stat.lexicon_count", - "response.text_stat.sentence_count", - "response.text_stat.syllable_count", + "response.stats.char_count", + "response.stats.difficult_words", + "response.stats.flesch_kincaid_grade", + "response.stats.flesch_reading_ease", + "response.stats.letter_count", + "response.stats.lexicon_count", + "response.stats.sentence_count", + "response.stats.syllable_count", ] assert actual.index.tolist() == expected_columns - assert "prompt.text_stat.char_count" not in actual["distribution/max"] - assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", "")) + assert "prompt.stats.char_count" not in actual["distribution/max"] + assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", "")) actual_row = _log(row, response_textstat_schema) assert actual_row.index.tolist() == expected_columns - assert "prompt.text_stat.char_count" not in actual_row["distribution/max"] - assert actual_row["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", "")) + assert "prompt.stats.char_count" not in actual_row["distribution/max"] + assert actual_row["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", "")) def test_prompt_reading_ease_module(): @@ -194,7 +194,7 @@ def test_prompt_reading_ease_module(): assert actual.index.tolist() == [ "prompt", - "prompt.text_stat.flesch_reading_ease", + "prompt.stats.flesch_reading_ease", "response", ] @@ -212,7 +212,7 @@ def test_response_reading_ease_module(): assert actual.index.tolist() == [ "prompt", "response", - "response.text_stat.flesch_reading_ease", + "response.stats.flesch_reading_ease", ] @@ -228,9 +228,9 @@ def test_prompt_response_flesch_kincaid_grade_level_module(): assert actual.index.tolist() == [ "prompt", - "prompt.text_stat.flesch_kincaid_grade", + "prompt.stats.flesch_kincaid_grade", "response", - "response.text_stat.flesch_kincaid_grade", + "response.stats.flesch_kincaid_grade", ] @@ -243,7 +243,7 @@ def test_prompt_char_count_module(): assert actual.index.tolist() == [ "prompt", - "prompt.text_stat.char_count", + "prompt.stats.char_count", "response", ] @@ -251,7 +251,7 @@ def test_prompt_char_count_module(): def test_prompt_char_count_0_module(): wf = EvaluationWorkflow( metrics=[prompt_char_count_metric, response_char_count_metric], - validators=[ConstraintValidator("prompt.text_stat.char_count", lower_threshold=2)], + validators=[ConstraintValidator("prompt.stats.char_count", lower_threshold=2)], ) df = pd.DataFrame( @@ -267,18 +267,18 @@ def test_prompt_char_count_0_module(): actual = wf.run(df) assert actual.metrics.columns.tolist() == [ - "prompt.text_stat.char_count", - "response.text_stat.char_count", + 
"prompt.stats.char_count", + "response.stats.char_count", "id", ] print(actual.metrics.transpose()) - assert actual.metrics["prompt.text_stat.char_count"][0] == 0 + assert actual.metrics["prompt.stats.char_count"][0] == 0 assert actual.validation_results == ValidationResult( report=[ ValidationFailure( id="0", - metric="prompt.text_stat.char_count", + metric="prompt.stats.char_count", details="Value 0 is below threshold 2", value=0, upper_threshold=None, @@ -289,7 +289,7 @@ def test_prompt_char_count_0_module(): def test_text_stat_group(): - wf = EvaluationWorkflow(metrics=[lib.prompt.text_stat()]) + wf = EvaluationWorkflow(metrics=[lib.prompt.stats()]) df = pd.DataFrame( { "prompt": [ @@ -306,26 +306,28 @@ def test_text_stat_group(): assert sorted(actual.metrics.columns.tolist()) == sorted( # pyright: ignore[reportUnknownArgumentType] [ "id", - "prompt.text_stat.char_count", - "prompt.text_stat.difficult_words", - "prompt.text_stat.flesch_kincaid_grade", - "prompt.text_stat.flesch_reading_ease", - "prompt.text_stat.letter_count", - "prompt.text_stat.lexicon_count", - "prompt.text_stat.sentence_count", - "prompt.text_stat.syllable_count", + "prompt.stats.char_count", + "prompt.stats.difficult_words", + "prompt.stats.flesch_kincaid_grade", + "prompt.stats.flesch_reading_ease", + "prompt.stats.letter_count", + "prompt.stats.lexicon_count", + "prompt.stats.sentence_count", + "prompt.stats.syllable_count", + "prompt.stats.token_count", ] ) print(actual.metrics.transpose()) - assert actual.metrics["prompt.text_stat.char_count"][0] == 4 - assert actual.metrics["prompt.text_stat.difficult_words"][0] == 0 - assert actual.metrics["prompt.text_stat.flesch_kincaid_grade"][0] == -3.5 - assert actual.metrics["prompt.text_stat.flesch_reading_ease"][0] == 121.22 - assert actual.metrics["prompt.text_stat.letter_count"][0] == 4 - assert actual.metrics["prompt.text_stat.lexicon_count"][0] == 1 - assert actual.metrics["prompt.text_stat.sentence_count"][0] == 1 - assert actual.metrics["prompt.text_stat.syllable_count"][0] == 1 + assert actual.metrics["prompt.stats.char_count"][0] == 4 + assert actual.metrics["prompt.stats.difficult_words"][0] == 0 + assert actual.metrics["prompt.stats.flesch_kincaid_grade"][0] == -3.5 + assert actual.metrics["prompt.stats.flesch_reading_ease"][0] == 121.22 + assert actual.metrics["prompt.stats.letter_count"][0] == 4 + assert actual.metrics["prompt.stats.lexicon_count"][0] == 1 + assert actual.metrics["prompt.stats.sentence_count"][0] == 1 + assert actual.metrics["prompt.stats.syllable_count"][0] == 1 + assert actual.metrics["prompt.stats.token_count"][0] == 1 def test_response_char_count_module(): @@ -338,7 +340,7 @@ def test_response_char_count_module(): assert actual.index.tolist() == [ "prompt", "response", - "response.text_stat.char_count", + "response.stats.char_count", ] @@ -365,18 +367,18 @@ def test_custom_module_combination(): expected_columns = [ "prompt", - "prompt.text_stat.char_count", - "prompt.text_stat.difficult_words", - "prompt.text_stat.flesch_reading_ease", + "prompt.stats.char_count", + "prompt.stats.difficult_words", + "prompt.stats.flesch_reading_ease", "response", - "response.text_stat.char_count", - "response.text_stat.sentence_count", + "response.stats.char_count", + "response.stats.sentence_count", ] assert list(actual.columns) == expected_metrics assert actual.index.tolist() == expected_columns - assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", "")) - assert 
actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", "")) + assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", "")) + assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", "")) # and you get the same results if you combine the modules in different ways @@ -397,8 +399,8 @@ def test_custom_module_combination(): assert list(actual.columns) == expected_metrics assert actual.index.tolist() == expected_columns - assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", "")) - assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", "")) + assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", "")) + assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", "")) def test_multi_text_stat_metric(): @@ -444,12 +446,12 @@ def udf(text: pd.DataFrame) -> MultiMetricResult: "prompt", "prompt.custom_textstat1", "prompt.custom_textstat2", - "prompt.text_stat.char_count", + "prompt.stats.char_count", "response", ] assert actual.index.tolist() == expected_columns - assert actual["distribution/max"]["prompt.text_stat.char_count"] == 28 + assert actual["distribution/max"]["prompt.stats.char_count"] == 28 assert actual["distribution/min"]["prompt.custom_textstat1"] == 2 assert actual["distribution/max"]["prompt.custom_textstat1"] == 26 assert actual["distribution/min"]["prompt.custom_textstat2"] == 4 diff --git a/tests/langkit/validators/test_comparison.py b/tests/langkit/validators/test_comparison.py index 6cc20b1..de22c59 100644 --- a/tests/langkit/validators/test_comparison.py +++ b/tests/langkit/validators/test_comparison.py @@ -8,19 +8,19 @@ def test_one_required(): with pytest.raises(Exception): - ConstraintValidator("prompt.text_stat.char_count") + ConstraintValidator("prompt.stats.char_count") def test_upper_threshold(): - validator = ConstraintValidator("prompt.text_stat.char_count", upper_threshold=5) - wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator]) + validator = ConstraintValidator("prompt.stats.char_count", upper_threshold=5) + wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator]) result = wf.run({"prompt": "1234567890"}) assert result.validation_results.report == [ ValidationFailure( id="0", - metric="prompt.text_stat.char_count", + metric="prompt.stats.char_count", details="Value 10 is above threshold 5", value=10, upper_threshold=5, @@ -29,15 +29,15 @@ def test_upper_threshold(): def test_lower_threshold(): - validator = ConstraintValidator("prompt.text_stat.char_count", lower_threshold=5) - wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator]) + validator = ConstraintValidator("prompt.stats.char_count", lower_threshold=5) + wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator]) result = wf.run({"prompt": "1"}) assert result.validation_results.report == [ ValidationFailure( id="0", - metric="prompt.text_stat.char_count", + metric="prompt.stats.char_count", details="Value 1 is below threshold 5", value=1, lower_threshold=5, @@ -46,15 +46,15 @@ def test_lower_threshold(): def test_upper_threshold_inclusive(): - validator = ConstraintValidator("prompt.text_stat.char_count", upper_threshold_inclusive=5) - wf = 
EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator]) + validator = ConstraintValidator("prompt.stats.char_count", upper_threshold_inclusive=5) + wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator]) result = wf.run({"prompt": "12345"}) assert result.validation_results.report == [ ValidationFailure( id="0", - metric="prompt.text_stat.char_count", + metric="prompt.stats.char_count", details="Value 5 is above or equal to threshold 5", value=5, upper_threshold=5, @@ -63,15 +63,15 @@ def test_upper_threshold_inclusive(): def test_lower_threshold_inclusive(): - validator = ConstraintValidator("prompt.text_stat.char_count", lower_threshold_inclusive=5) - wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator]) + validator = ConstraintValidator("prompt.stats.char_count", lower_threshold_inclusive=5) + wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator]) result = wf.run({"prompt": "12345"}) assert result.validation_results.report == [ ValidationFailure( id="0", - metric="prompt.text_stat.char_count", + metric="prompt.stats.char_count", details="Value 5 is below or equal to threshold 5", value=5, lower_threshold=5, @@ -80,15 +80,15 @@ def test_lower_threshold_inclusive(): def test_one_of(): - validator = ConstraintValidator("prompt.text_stat.char_count", one_of=[1, 2, 3]) - wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator]) + validator = ConstraintValidator("prompt.stats.char_count", one_of=[1, 2, 3]) + wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator]) result = wf.run({"prompt": "asdf"}) assert result.validation_results.report == [ ValidationFailure( id="0", - metric="prompt.text_stat.char_count", + metric="prompt.stats.char_count", details="Value 4 is not in allowed values {1, 2, 3}", value=4, allowed_values=[1, 2, 3], @@ -97,15 +97,15 @@ def test_one_of(): def test_none_of(): - validator = ConstraintValidator("prompt.text_stat.char_count", none_of=[1, 2, 3]) - wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator]) + validator = ConstraintValidator("prompt.stats.char_count", none_of=[1, 2, 3]) + wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator]) result = wf.run({"prompt": "asd"}) assert result.validation_results.report == [ ValidationFailure( id="0", - metric="prompt.text_stat.char_count", + metric="prompt.stats.char_count", details="Value 3 is in disallowed values {1, 2, 3}", value=3, disallowed_values=[1, 2, 3], From c5844202d6fb6e8d3bb7bce7280f10426dbf031c Mon Sep 17 00:00:00 2001 From: Anthony Naddeo Date: Sat, 9 Mar 2024 09:55:51 -0800 Subject: [PATCH 3/3] bump version --- .bumpversion.cfg | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 2556f98..426909e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.87 +current_version = 0.0.88 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
serialize = diff --git a/pyproject.toml b/pyproject.toml index fb98c1c..fc138e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langkit" -version = "0.0.87" +version = "0.0.88" description = "A language toolkit for monitoring LLM interactions" authors = ["WhyLabs.ai "] homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
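A note on PATCH 1/3 above: wiring cache_assets and init into token_metric lets the
tiktoken encoding be fetched when assets are cached and loaded once up front, rather
than lazily on the first evaluate call. Below is a minimal sketch of the same hook
pattern for a hypothetical custom metric; the import path and the metric itself are
assumptions for illustration, and only the SingleMetric fields mirror the diff.

    import pandas as pd

    # Import path assumed; token.py's own imports are not shown in this patch.
    from langkit.core.metrics import Metric, SingleMetric, SingleMetricResult, UdfInput


    def upper_count_metric(column_name: str) -> Metric:
        def cache_assets():
            # Download or warm any heavy assets ahead of time (no-op for this toy metric).
            pass

        def init():
            # Load resources once, before the first evaluate() call.
            pass

        def udf(text: pd.DataFrame) -> SingleMetricResult:
            # Count uppercase characters per row of the target column.
            counts = [sum(ch.isupper() for ch in it) for it in UdfInput(text).iter_column_rows(column_name)]
            return SingleMetricResult(counts)

        return SingleMetric(
            name=f"{column_name}.stats.upper_count",
            input_name=column_name,
            evaluate=udf,
            init=init,
            cache_assets=cache_assets,
        )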