
Stats #257 (merged)

Merged 3 commits on Mar 9, 2024.
Changes from all commits
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.87
+current_version = 0.0.88
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize =
3 changes: 1 addition & 2 deletions langkit/core/workflow.py
@@ -178,8 +178,7 @@ def run(self, data: Dict[str, str]) -> EvaluationResult:
     def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResult:
         start = time.perf_counter()

-        if not self._initialized:
-            self.init()
+        self.init()

         if not isinstance(data, pd.DataFrame):
             if not is_dict_with_strings(data):
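This hunk drops the `_initialized` guard at the call site, so `run` now calls `self.init()` on every invocation; that stays cheap only if `init` itself short-circuits after the first call. A minimal sketch of that idempotent-init pattern, with hypothetical names (the real `EvaluationWorkflow.init` may differ):

    class Workflow:
        def __init__(self) -> None:
            self._initialized = False

        def init(self) -> None:
            # Short-circuit so run() can call this unconditionally.
            if self._initialized:
                return
            self._initialized = True
            # ... one-time setup: load models, warm metric caches, etc.

        def run(self, data) -> None:
            self.init()  # cheap no-op after the first call
            # ... evaluate metrics against data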
28 changes: 10 additions & 18 deletions langkit/metrics/library.py
@@ -48,14 +48,14 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:

     - prompt.pii.*
     - prompt.stats.token_count
-    - prompt.text_stat.char_count
+    - prompt.stats.char_count
     - prompt.similarity.injection
     - prompt.similarity.jailbreak

     - response.pii.*
     - response.stats.token_count
-    - response.text_stat.char_count
-    - response.text_stat.reading_ease
+    - response.stats.char_count
+    - response.stats.reading_ease
     - response.sentiment.sentiment_score
     - response.toxicity.toxicity_score
     - response.similarity.refusal
@@ -64,16 +64,16 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:
     prompt_metrics = [
         lib.prompt.pii,
         lib.prompt.stats.token_count,
-        lib.prompt.text_stat.char_count,
+        lib.prompt.stats.char_count,
         lib.prompt.similarity.injection,
         lib.prompt.similarity.jailbreak,
     ]

     response_metrics = [
         lib.response.pii,
         lib.response.stats.token_count,
-        lib.response.text_stat.char_count,
-        lib.response.text_stat.reading_ease,
+        lib.response.stats.char_count,
+        lib.response.stats.reading_ease,
         lib.response.sentiment.sentiment_score,
         lib.response.toxicity.toxicity_score,
         lib.response.similarity.refusal,
@@ -117,11 +117,11 @@ def toxicity_score() -> MetricCreator:

             return prompt_toxicity_metric

-    class text_stat:
+    class stats:
         def __call__(self) -> MetricCreator:
             from langkit.metrics.text_statistics import prompt_textstat_metric

-            return prompt_textstat_metric
+            return [lib.prompt.stats.token_count, prompt_textstat_metric]

         @staticmethod
         def char_count() -> MetricCreator:
@@ -171,10 +171,6 @@ def difficult_words() -> MetricCreator:

             return prompt_difficult_words_metric

-    class stats:
-        def __call__(self) -> MetricCreator:
-            return [lib.prompt.stats.token_count]
-
         @staticmethod
         def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator:
             """
@@ -303,11 +299,11 @@ def toxicity_score() -> MetricCreator:

             return response_toxicity_metric

-    class text_stat:
+    class stats:
         def __call__(self) -> MetricCreator:
             from langkit.metrics.text_statistics import response_textstat_metric

-            return response_textstat_metric
+            return [lib.response.stats.token_count, response_textstat_metric]

         @staticmethod
         def char_count() -> MetricCreator:
@@ -357,10 +353,6 @@ def difficult_words() -> MetricCreator:

             return response_difficult_words_metric

-    class stats:
-        def __call__(self) -> MetricCreator:
-            return [lib.response.stats.token_count]
-
         @staticmethod
         def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator:
             """
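With the rename, the textstat-derived metrics move from `text_stat` to `stats`, and calling the `stats` group now also pulls in `token_count`. A hedged usage sketch, modeled on the workflow construction in the tests below (exact import paths assumed):

    from langkit.core.workflow import EvaluationWorkflow
    from langkit.metrics.library import lib

    # The whole group: token_count plus the textstat-derived metrics.
    wf = EvaluationWorkflow(metrics=[lib.prompt.stats])

    # Or pick individual metrics under their new names.
    wf = EvaluationWorkflow(
        metrics=[lib.prompt.stats.char_count, lib.response.stats.reading_ease]
    )

    result = wf.run({"prompt": "hello", "response": "hi there"})
    print(result.metrics["prompt.stats.char_count"][0])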
2 changes: 1 addition & 1 deletion langkit/metrics/text_statistics.py
@@ -14,7 +14,7 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         return SingleMetricResult(metrics)

     return SingleMetric(
-        name=f"{column_name}.text_stat.{stat}",
+        name=f"{column_name}.stats.{stat}",
         input_name=column_name,
         evaluate=udf,
     )
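Because the name template changed, every metric this module emits now lands under the `stats` namespace (for example `prompt.stats.char_count` rather than `prompt.text_stat.char_count`), so downstream consumers keyed on the old `*.text_stat.*` names need the same update the tests below receive.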
8 changes: 8 additions & 0 deletions langkit/metrics/token.py
@@ -12,6 +12,12 @@ def _get_encoder(encoding: str):


 def token_metric(column_name: str, encoding: str = "cl100k_base") -> Metric:
+    def cache_assets():
+        _get_encoder(encoding)
+
+    def init():
+        _get_encoder(encoding)
+
     def udf(text: pd.DataFrame) -> SingleMetricResult:
         encoder = _get_encoder(encoding)
         encoding_len = [len(encoder.encode(it)) for it in UdfInput(text).iter_column_rows(column_name)]
@@ -21,6 +27,8 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         name=f"{column_name}.stats.token_count",
         input_name=column_name,
         evaluate=udf,
+        init=init,
+        cache_assets=cache_assets,
     )

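The new `cache_assets` and `init` hooks both just touch `_get_encoder`, which pre-warms the tiktoken encoder before the first evaluation; that only pays off if the lookup is memoized. A minimal sketch of the helper under that assumption (the real `_get_encoder` may differ):

    from functools import lru_cache

    import tiktoken

    @lru_cache(maxsize=None)
    def _get_encoder(encoding: str):
        # The first call builds (or downloads) the encoding; later calls,
        # including the one inside udf at evaluation time, are cache hits.
        return tiktoken.get_encoding(encoding)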
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.87"
+version = "0.0.88"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai <[email protected]>"]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
8 changes: 4 additions & 4 deletions tests/langkit/callbacks/test_webhook.py
@@ -7,8 +7,8 @@

 def test_webhook_failures_dont_ruin_run():
     wf = EvaluationWorkflow(
-        metrics=[metric_lib.prompt.text_stat.char_count],
-        validators=[validator_lib.constraint("prompt.text_stat.char_count", upper_threshold=5)],
+        metrics=[metric_lib.prompt.stats.char_count],
+        validators=[validator_lib.constraint("prompt.stats.char_count", upper_threshold=5)],
         callbacks=[callback_lib.webhook.basic_validation_failure("https://foo.bar")],  # will fail, url doesn't exist
     )

@@ -17,10 +17,10 @@ def test_webhook_failures_dont_ruin_run():
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 10 is above threshold 5",
             value=10,
             upper_threshold=5,
         )
     ]
-    assert result.metrics["prompt.text_stat.char_count"][0] == 10
+    assert result.metrics["prompt.stats.char_count"][0] == 10
6 changes: 3 additions & 3 deletions tests/langkit/metrics/test_library.py
@@ -19,7 +19,7 @@ def test_recommended():
         "prompt.pii.us_bank_number",
         "prompt.pii.redacted",
         "prompt.stats.token_count",
-        "prompt.text_stat.char_count",
+        "prompt.stats.char_count",
         "prompt.similarity.injection",
         "prompt.similarity.jailbreak",
         "response.pii.phone_number",
@@ -29,8 +29,8 @@ def test_recommended():
         "response.pii.us_bank_number",
         "response.pii.redacted",
         "response.stats.token_count",
-        "response.text_stat.char_count",
-        "response.text_stat.flesch_reading_ease",
+        "response.stats.char_count",
+        "response.stats.flesch_reading_ease",
         "response.sentiment.sentiment_score",
         "response.toxicity",
         "response.similarity.refusal",
37 changes: 0 additions & 37 deletions tests/langkit/metrics/test_metric_library.py

This file was deleted.
