diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 2556f98..426909e 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.87
+current_version = 0.0.88
 tag = False
 parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))?
 serialize = 
diff --git a/langkit/core/workflow.py b/langkit/core/workflow.py
index f3d363b..67b8705 100644
--- a/langkit/core/workflow.py
+++ b/langkit/core/workflow.py
@@ -178,8 +178,7 @@ def run(self, data: Dict[str, str]) -> EvaluationResult:
 
     def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResult:
         start = time.perf_counter()
-        if not self._initialized:
-            self.init()
+        self.init()
 
         if not isinstance(data, pd.DataFrame):
             if not is_dict_with_strings(data):
diff --git a/langkit/metrics/library.py b/langkit/metrics/library.py
index e3e99b3..6788722 100644
--- a/langkit/metrics/library.py
+++ b/langkit/metrics/library.py
@@ -48,14 +48,14 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:
 
         - prompt.pii.*
         - prompt.stats.token_count
-        - prompt.text_stat.char_count
+        - prompt.stats.char_count
         - prompt.similarity.injection
         - prompt.similarity.jailbreak
 
         - response.pii.*
         - response.stats.token_count
-        - response.text_stat.char_count
-        - response.text_stat.reading_ease
+        - response.stats.char_count
+        - response.stats.reading_ease
         - response.sentiment.sentiment_score
         - response.toxicity.toxicity_score
         - response.similarity.refusal
@@ -64,7 +64,7 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:
         prompt_metrics = [
             lib.prompt.pii,
             lib.prompt.stats.token_count,
-            lib.prompt.text_stat.char_count,
+            lib.prompt.stats.char_count,
             lib.prompt.similarity.injection,
             lib.prompt.similarity.jailbreak,
         ]
@@ -72,8 +72,8 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:
         response_metrics = [
             lib.response.pii,
             lib.response.stats.token_count,
-            lib.response.text_stat.char_count,
-            lib.response.text_stat.reading_ease,
+            lib.response.stats.char_count,
+            lib.response.stats.reading_ease,
             lib.response.sentiment.sentiment_score,
             lib.response.toxicity.toxicity_score,
             lib.response.similarity.refusal,
@@ -117,11 +117,11 @@ def toxicity_score() -> MetricCreator:
 
             return prompt_toxicity_metric
 
-        class text_stat:
+        class stats:
             def __call__(self) -> MetricCreator:
                 from langkit.metrics.text_statistics import prompt_textstat_metric
 
-                return prompt_textstat_metric
+                return [lib.prompt.stats.token_count, prompt_textstat_metric]
 
             @staticmethod
             def char_count() -> MetricCreator:
@@ -171,10 +171,6 @@ def difficult_words() -> MetricCreator:
 
             return prompt_difficult_words_metric
 
-        class stats:
-            def __call__(self) -> MetricCreator:
-                return [lib.prompt.stats.token_count]
-
         @staticmethod
         def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator:
             """
@@ -303,11 +299,11 @@ def toxicity_score() -> MetricCreator:
 
             return response_toxicity_metric
 
-        class text_stat:
+        class stats:
             def __call__(self) -> MetricCreator:
                 from langkit.metrics.text_statistics import response_textstat_metric
 
-                return response_textstat_metric
+                return [lib.response.stats.token_count, response_textstat_metric]
 
             @staticmethod
             def char_count() -> MetricCreator:
@@ -357,10 +353,6 @@ def difficult_words() -> MetricCreator:
 
             return response_difficult_words_metric
 
-        class stats:
-            def __call__(self) -> MetricCreator:
-                return [lib.response.stats.token_count]
-
         @staticmethod
         def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator:
             """
diff --git a/langkit/metrics/text_statistics.py b/langkit/metrics/text_statistics.py
index 03b5ca8..b5229d0 100644
--- a/langkit/metrics/text_statistics.py
+++ b/langkit/metrics/text_statistics.py
@@ -14,7 +14,7 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         return SingleMetricResult(metrics)
 
     return SingleMetric(
-        name=f"{column_name}.text_stat.{stat}",
+        name=f"{column_name}.stats.{stat}",
         input_name=column_name,
         evaluate=udf,
     )
diff --git a/langkit/metrics/token.py b/langkit/metrics/token.py
index 939650f..a4ed229 100644
--- a/langkit/metrics/token.py
+++ b/langkit/metrics/token.py
@@ -12,6 +12,12 @@ def _get_encoder(encoding: str):
 
 
 def token_metric(column_name: str, encoding: str = "cl100k_base") -> Metric:
+    def cache_assets():
+        _get_encoder(encoding)
+
+    def init():
+        _get_encoder(encoding)
+
     def udf(text: pd.DataFrame) -> SingleMetricResult:
         encoder = _get_encoder(encoding)
         encoding_len = [len(encoder.encode(it)) for it in UdfInput(text).iter_column_rows(column_name)]
@@ -21,6 +27,8 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         name=f"{column_name}.stats.token_count",
         input_name=column_name,
         evaluate=udf,
+        init=init,
+        cache_assets=cache_assets,
     )
 
 
diff --git a/pyproject.toml b/pyproject.toml
index fb98c1c..fc138e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.87"
+version = "0.0.88"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai "]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
diff --git a/tests/langkit/callbacks/test_webhook.py b/tests/langkit/callbacks/test_webhook.py
index e9044d1..e6a399c 100644
--- a/tests/langkit/callbacks/test_webhook.py
+++ b/tests/langkit/callbacks/test_webhook.py
@@ -7,8 +7,8 @@
 
 def test_webhook_failures_dont_ruin_run():
     wf = EvaluationWorkflow(
-        metrics=[metric_lib.prompt.text_stat.char_count],
-        validators=[validator_lib.constraint("prompt.text_stat.char_count", upper_threshold=5)],
+        metrics=[metric_lib.prompt.stats.char_count],
+        validators=[validator_lib.constraint("prompt.stats.char_count", upper_threshold=5)],
         callbacks=[callback_lib.webhook.basic_validation_failure("https://foo.bar")],  # will fail, url doesn't exist
     )
 
@@ -17,10 +17,10 @@ def test_webhook_failures_dont_ruin_run():
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
            details="Value 10 is above threshold 5",
             value=10,
             upper_threshold=5,
         )
     ]
-    assert result.metrics["prompt.text_stat.char_count"][0] == 10
+    assert result.metrics["prompt.stats.char_count"][0] == 10
diff --git a/tests/langkit/metrics/test_library.py b/tests/langkit/metrics/test_library.py
index 3dc9f2a..e09b14d 100644
--- a/tests/langkit/metrics/test_library.py
+++ b/tests/langkit/metrics/test_library.py
@@ -19,7 +19,7 @@ def test_recommended():
         "prompt.pii.us_bank_number",
         "prompt.pii.redacted",
         "prompt.stats.token_count",
-        "prompt.text_stat.char_count",
+        "prompt.stats.char_count",
         "prompt.similarity.injection",
         "prompt.similarity.jailbreak",
         "response.pii.phone_number",
@@ -29,8 +29,8 @@ def test_recommended():
         "response.pii.us_bank_number",
         "response.pii.redacted",
         "response.stats.token_count",
-        "response.text_stat.char_count",
-        "response.text_stat.flesch_reading_ease",
+        "response.stats.char_count",
+        "response.stats.flesch_reading_ease",
         "response.sentiment.sentiment_score",
         "response.toxicity",
         "response.similarity.refusal",
diff --git a/tests/langkit/metrics/test_metric_library.py b/tests/langkit/metrics/test_metric_library.py
deleted file mode 100644
index c0f5516..0000000
--- a/tests/langkit/metrics/test_metric_library.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from langkit.core.workflow import EvaluationWorkflow
-from langkit.metrics.library import lib
-
-
-def test_recommended():
-    row = {"prompt": "Hi, how are you doing today?", "response": "I'm doing great, how about you?"}
-    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
-
-    actual = wf.run(row)
-
-    expected_columns = [
-        "prompt.pii.phone_number",
-        "prompt.pii.email_address",
-        "prompt.pii.credit_card",
-        "prompt.pii.us_ssn",
-        "prompt.pii.us_bank_number",
-        "prompt.pii.redacted",
-        "prompt.stats.token_count",
-        "prompt.text_stat.char_count",
-        "prompt.similarity.injection",
-        "prompt.similarity.jailbreak",
-        "response.pii.phone_number",
-        "response.pii.email_address",
-        "response.pii.credit_card",
-        "response.pii.us_ssn",
-        "response.pii.us_bank_number",
-        "response.pii.redacted",
-        "response.stats.token_count",
-        "response.text_stat.char_count",
-        "response.text_stat.flesch_reading_ease",
-        "response.sentiment.sentiment_score",
-        "response.toxicity",
-        "response.similarity.refusal",
-        "id",
-    ]
-
-    assert list(actual.metrics.columns) == expected_columns
diff --git a/tests/langkit/metrics/test_text_statistics.py b/tests/langkit/metrics/test_text_statistics.py
index 672abd5..cbd2492 100644
--- a/tests/langkit/metrics/test_text_statistics.py
+++ b/tests/langkit/metrics/test_text_statistics.py
@@ -90,34 +90,34 @@ def test_prompt_response_textstat_module():
 
     expected_columns = [
         "prompt",
-        "prompt.text_stat.char_count",
-        "prompt.text_stat.difficult_words",
-        "prompt.text_stat.flesch_kincaid_grade",
-        "prompt.text_stat.flesch_reading_ease",
-        "prompt.text_stat.letter_count",
-        "prompt.text_stat.lexicon_count",
-        "prompt.text_stat.sentence_count",
-        "prompt.text_stat.syllable_count",
+        "prompt.stats.char_count",
+        "prompt.stats.difficult_words",
+        "prompt.stats.flesch_kincaid_grade",
+        "prompt.stats.flesch_reading_ease",
+        "prompt.stats.letter_count",
+        "prompt.stats.lexicon_count",
+        "prompt.stats.sentence_count",
+        "prompt.stats.syllable_count",
         "response",
-        "response.text_stat.char_count",
-        "response.text_stat.difficult_words",
-        "response.text_stat.flesch_kincaid_grade",
-        "response.text_stat.flesch_reading_ease",
-        "response.text_stat.letter_count",
-        "response.text_stat.lexicon_count",
-        "response.text_stat.sentence_count",
-        "response.text_stat.syllable_count",
+        "response.stats.char_count",
+        "response.stats.difficult_words",
+        "response.stats.flesch_kincaid_grade",
+        "response.stats.flesch_reading_ease",
+        "response.stats.letter_count",
+        "response.stats.lexicon_count",
+        "response.stats.sentence_count",
+        "response.stats.syllable_count",
     ]
 
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
     actual_row = _log(row, all_textstat_schema)
     assert actual_row.index.tolist() == expected_columns
-    assert actual_row["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert actual_row["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert actual_row["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert actual_row["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
 
 def test_prompt_textstat_module():
@@ -129,26 +129,26 @@
 
     expected_columns = [
         "prompt",
-        "prompt.text_stat.char_count",
-        "prompt.text_stat.difficult_words",
-        "prompt.text_stat.flesch_kincaid_grade",
-        "prompt.text_stat.flesch_reading_ease",
-        "prompt.text_stat.letter_count",
-        "prompt.text_stat.lexicon_count",
-        "prompt.text_stat.sentence_count",
-        "prompt.text_stat.syllable_count",
+        "prompt.stats.char_count",
+        "prompt.stats.difficult_words",
+        "prompt.stats.flesch_kincaid_grade",
+        "prompt.stats.flesch_reading_ease",
+        "prompt.stats.letter_count",
+        "prompt.stats.lexicon_count",
+        "prompt.stats.sentence_count",
+        "prompt.stats.syllable_count",
         "response",
     ]
 
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert "response.text_stat.char_count" not in actual["distribution/max"]
+    assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert "response.stats.char_count" not in actual["distribution/max"]
 
     actual_row = _log(row, prompt_textstat_schema)
     assert actual_row.index.tolist() == expected_columns
-    assert actual_row["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert "response.text_stat.char_count" not in actual_row["distribution/max"]
+    assert actual_row["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert "response.stats.char_count" not in actual_row["distribution/max"]
 
 
 def test_response_textstat_module():
@@ -161,25 +161,25 @@
     expected_columns = [
         "prompt",
         "response",
-        "response.text_stat.char_count",
-        "response.text_stat.difficult_words",
-        "response.text_stat.flesch_kincaid_grade",
-        "response.text_stat.flesch_reading_ease",
-        "response.text_stat.letter_count",
-        "response.text_stat.lexicon_count",
-        "response.text_stat.sentence_count",
-        "response.text_stat.syllable_count",
+        "response.stats.char_count",
+        "response.stats.difficult_words",
+        "response.stats.flesch_kincaid_grade",
+        "response.stats.flesch_reading_ease",
+        "response.stats.letter_count",
+        "response.stats.lexicon_count",
+        "response.stats.sentence_count",
+        "response.stats.syllable_count",
     ]
 
     assert actual.index.tolist() == expected_columns
-    assert "prompt.text_stat.char_count" not in actual["distribution/max"]
-    assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert "prompt.stats.char_count" not in actual["distribution/max"]
+    assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
     actual_row = _log(row, response_textstat_schema)
     assert actual_row.index.tolist() == expected_columns
-    assert "prompt.text_stat.char_count" not in actual_row["distribution/max"]
-    assert actual_row["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert "prompt.stats.char_count" not in actual_row["distribution/max"]
+    assert actual_row["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
 
 def test_prompt_reading_ease_module():
@@ -194,7 +194,7 @@
 
     assert actual.index.tolist() == [
         "prompt",
-        "prompt.text_stat.flesch_reading_ease",
+        "prompt.stats.flesch_reading_ease",
         "response",
     ]
 
@@ -212,7 +212,7 @@
     assert actual.index.tolist() == [
         "prompt",
         "response",
-        "response.text_stat.flesch_reading_ease",
+        "response.stats.flesch_reading_ease",
     ]
 
 
@@ -228,9 +228,9 @@
 
     assert actual.index.tolist() == [
         "prompt",
-        "prompt.text_stat.flesch_kincaid_grade",
+        "prompt.stats.flesch_kincaid_grade",
         "response",
-        "response.text_stat.flesch_kincaid_grade",
+        "response.stats.flesch_kincaid_grade",
     ]
 
 
@@ -243,7 +243,7 @@
 
     assert actual.index.tolist() == [
         "prompt",
-        "prompt.text_stat.char_count",
+        "prompt.stats.char_count",
        "response",
     ]
 
@@ -251,7 +251,7 @@
 def test_prompt_char_count_0_module():
     wf = EvaluationWorkflow(
         metrics=[prompt_char_count_metric, response_char_count_metric],
-        validators=[ConstraintValidator("prompt.text_stat.char_count", lower_threshold=2)],
+        validators=[ConstraintValidator("prompt.stats.char_count", lower_threshold=2)],
     )
 
     df = pd.DataFrame(
@@ -267,18 +267,18 @@
     actual = wf.run(df)
 
     assert actual.metrics.columns.tolist() == [
-        "prompt.text_stat.char_count",
-        "response.text_stat.char_count",
+        "prompt.stats.char_count",
+        "response.stats.char_count",
         "id",
     ]
 
     print(actual.metrics.transpose())
-    assert actual.metrics["prompt.text_stat.char_count"][0] == 0
+    assert actual.metrics["prompt.stats.char_count"][0] == 0
     assert actual.validation_results == ValidationResult(
         report=[
             ValidationFailure(
                 id="0",
-                metric="prompt.text_stat.char_count",
+                metric="prompt.stats.char_count",
                 details="Value 0 is below threshold 2",
                 value=0,
                 upper_threshold=None,
@@ -289,7 +289,7 @@
 
 
 def test_text_stat_group():
-    wf = EvaluationWorkflow(metrics=[lib.prompt.text_stat()])
+    wf = EvaluationWorkflow(metrics=[lib.prompt.stats()])
     df = pd.DataFrame(
         {
             "prompt": [
@@ -306,26 +306,28 @@
     assert sorted(actual.metrics.columns.tolist()) == sorted(  # pyright: ignore[reportUnknownArgumentType]
         [
             "id",
-            "prompt.text_stat.char_count",
-            "prompt.text_stat.difficult_words",
-            "prompt.text_stat.flesch_kincaid_grade",
-            "prompt.text_stat.flesch_reading_ease",
-            "prompt.text_stat.letter_count",
-            "prompt.text_stat.lexicon_count",
-            "prompt.text_stat.sentence_count",
-            "prompt.text_stat.syllable_count",
+            "prompt.stats.char_count",
+            "prompt.stats.difficult_words",
+            "prompt.stats.flesch_kincaid_grade",
+            "prompt.stats.flesch_reading_ease",
+            "prompt.stats.letter_count",
+            "prompt.stats.lexicon_count",
+            "prompt.stats.sentence_count",
+            "prompt.stats.syllable_count",
+            "prompt.stats.token_count",
         ]
     )
 
     print(actual.metrics.transpose())
-    assert actual.metrics["prompt.text_stat.char_count"][0] == 4
-    assert actual.metrics["prompt.text_stat.difficult_words"][0] == 0
-    assert actual.metrics["prompt.text_stat.flesch_kincaid_grade"][0] == -3.5
-    assert actual.metrics["prompt.text_stat.flesch_reading_ease"][0] == 121.22
-    assert actual.metrics["prompt.text_stat.letter_count"][0] == 4
-    assert actual.metrics["prompt.text_stat.lexicon_count"][0] == 1
-    assert actual.metrics["prompt.text_stat.sentence_count"][0] == 1
-    assert actual.metrics["prompt.text_stat.syllable_count"][0] == 1
+    assert actual.metrics["prompt.stats.char_count"][0] == 4
+    assert actual.metrics["prompt.stats.difficult_words"][0] == 0
+    assert actual.metrics["prompt.stats.flesch_kincaid_grade"][0] == -3.5
+    assert actual.metrics["prompt.stats.flesch_reading_ease"][0] == 121.22
+    assert actual.metrics["prompt.stats.letter_count"][0] == 4
+    assert actual.metrics["prompt.stats.lexicon_count"][0] == 1
+    assert actual.metrics["prompt.stats.sentence_count"][0] == 1
+    assert actual.metrics["prompt.stats.syllable_count"][0] == 1
+    assert actual.metrics["prompt.stats.token_count"][0] == 1
 
 
 def test_response_char_count_module():
@@ -338,7 +340,7 @@
     assert actual.index.tolist() == [
         "prompt",
         "response",
-        "response.text_stat.char_count",
+        "response.stats.char_count",
     ]
 
 
@@ -365,18 +367,18 @@
 
     expected_columns = [
         "prompt",
-        "prompt.text_stat.char_count",
-        "prompt.text_stat.difficult_words",
-        "prompt.text_stat.flesch_reading_ease",
+        "prompt.stats.char_count",
+        "prompt.stats.difficult_words",
+        "prompt.stats.flesch_reading_ease",
         "response",
-        "response.text_stat.char_count",
-        "response.text_stat.sentence_count",
+        "response.stats.char_count",
+        "response.stats.sentence_count",
     ]
 
     assert list(actual.columns) == expected_metrics
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
     # and you get the same results if you combine the modules in different ways
@@ -397,8 +399,8 @@
 
     assert list(actual.columns) == expected_metrics
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == len(row["prompt"].replace(" ", ""))
-    assert actual["distribution/max"]["response.text_stat.char_count"] == len(row["response"].replace(" ", ""))
+    assert actual["distribution/max"]["prompt.stats.char_count"] == len(row["prompt"].replace(" ", ""))
+    assert actual["distribution/max"]["response.stats.char_count"] == len(row["response"].replace(" ", ""))
 
 
 def test_multi_text_stat_metric():
@@ -444,12 +446,12 @@ def udf(text: pd.DataFrame) -> MultiMetricResult:
         "prompt",
         "prompt.custom_textstat1",
         "prompt.custom_textstat2",
-        "prompt.text_stat.char_count",
+        "prompt.stats.char_count",
         "response",
     ]
 
     assert actual.index.tolist() == expected_columns
-    assert actual["distribution/max"]["prompt.text_stat.char_count"] == 28
+    assert actual["distribution/max"]["prompt.stats.char_count"] == 28
     assert actual["distribution/min"]["prompt.custom_textstat1"] == 2
     assert actual["distribution/max"]["prompt.custom_textstat1"] == 26
     assert actual["distribution/min"]["prompt.custom_textstat2"] == 4
diff --git a/tests/langkit/validators/test_comparison.py b/tests/langkit/validators/test_comparison.py
index 6cc20b1..de22c59 100644
--- a/tests/langkit/validators/test_comparison.py
+++ b/tests/langkit/validators/test_comparison.py
@@ -8,19 +8,19 @@
 
 def test_one_required():
     with pytest.raises(Exception):
-        ConstraintValidator("prompt.text_stat.char_count")
+        ConstraintValidator("prompt.stats.char_count")
 
 
 def test_upper_threshold():
-    validator = ConstraintValidator("prompt.text_stat.char_count", upper_threshold=5)
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", upper_threshold=5)
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "1234567890"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 10 is above threshold 5",
             value=10,
             upper_threshold=5,
@@ -29,15 +29,15 @@
 
 
 def test_lower_threshold():
-    validator = ConstraintValidator("prompt.text_stat.char_count", lower_threshold=5)
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", lower_threshold=5)
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "1"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 1 is below threshold 5",
             value=1,
             lower_threshold=5,
@@ -46,15 +46,15 @@
 
 
 def test_upper_threshold_inclusive():
-    validator = ConstraintValidator("prompt.text_stat.char_count", upper_threshold_inclusive=5)
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", upper_threshold_inclusive=5)
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "12345"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 5 is above or equal to threshold 5",
             value=5,
             upper_threshold=5,
@@ -63,15 +63,15 @@
 
 
 def test_lower_threshold_inclusive():
-    validator = ConstraintValidator("prompt.text_stat.char_count", lower_threshold_inclusive=5)
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", lower_threshold_inclusive=5)
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "12345"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 5 is below or equal to threshold 5",
             value=5,
             lower_threshold=5,
@@ -80,15 +80,15 @@
 
 
 def test_one_of():
-    validator = ConstraintValidator("prompt.text_stat.char_count", one_of=[1, 2, 3])
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", one_of=[1, 2, 3])
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "asdf"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 4 is not in allowed values {1, 2, 3}",
             value=4,
             allowed_values=[1, 2, 3],
@@ -97,15 +97,15 @@
 
 
 def test_none_of():
-    validator = ConstraintValidator("prompt.text_stat.char_count", none_of=[1, 2, 3])
-    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.text_stat.char_count], validators=[validator])
+    validator = ConstraintValidator("prompt.stats.char_count", none_of=[1, 2, 3])
+    wf = EvaluationWorkflow(metrics=[metric_lib.prompt.stats.char_count], validators=[validator])
 
     result = wf.run({"prompt": "asd"})
 
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 3 is in disallowed values {1, 2, 3}",
             value=3,
             disallowed_values=[1, 2, 3],
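Usage sketch (not part of the patch): a minimal example of the renamed metric namespace, assembled only from the EvaluationWorkflow and lib calls exercised in the tests above; the example strings are the same ones used in the deleted test.

    from langkit.core.workflow import EvaluationWorkflow
    from langkit.metrics.library import lib

    # Metric names now use the "prompt.stats.*" / "response.stats.*" prefix
    # instead of "prompt.text_stat.*" / "response.text_stat.*".
    wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])

    # run() accepts a dict of strings (as in the tests) or a pandas DataFrame.
    result = wf.run({"prompt": "Hi, how are you doing today?", "response": "I'm doing great, how about you?"})

    # Columns asserted on by the updated tests.
    print(result.metrics["prompt.stats.char_count"][0])
    print(result.metrics["response.stats.flesch_reading_ease"][0])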