whylabs · naddeoa · Mar 25, 2024 · Mar 25, 2024 · Mar 25, 2024 · Mar 25, 2024
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.96
+current_version = 0.0.103
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize = 

diff --git a/langkit/metrics/topic.py b/langkit/metrics/topic.py
@@ -38,6 +38,13 @@ def __get_scores_per_label(
     return scores_per_label
 
 
+def _sanitize_metric_name(topic: str) -> str:
+    """
+    Make a topic usable in a metric name: each whitespace character becomes "_" and the result is lowercased.
+    """
+    return "".join("_" if ch.isspace() else ch for ch in topic).lower()
+
+
 def topic_metric(input_name: str, topics: List[str], hypothesis_template: Optional[str] = None) -> MultiMetric:
     hypothesis_template = hypothesis_template or _hypothesis_template
 
@@ -62,7 +69,7 @@ def process_row(row: pd.DataFrame) -> Dict[str, List[Optional[float]]]:
     def cache_assets():
         __classifier.value
 
-    metric_names = [f"{input_name}.topics.{topic}" for topic in topics]
+    metric_names = [f"{input_name}.topics.{_sanitize_metric_name(topic)}" for topic in topics]
     return MultiMetric(names=metric_names, input_name=input_name, evaluate=udf, cache_assets=cache_assets)
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.96"
+version = "0.0.103"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai <[email protected]>"]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"

diff --git a/tests/langkit/metrics/test_topic.py b/tests/langkit/metrics/test_topic.py
@@ -172,10 +172,6 @@ def test_topic_library():
 
     topics = ["fishing", "boxing", "hiking", "swimming"]
     wf = Workflow(metrics=[lib.prompt.topics(topics), lib.response.topics(topics)])
-    # schema = WorkflowMetricConfigBuilder().add(lib.prompt.topics(topics)).add(lib.response.topics(topics)).build()
-    # schema = WorkflowMetricConfigBuilder().add(custom_topic_modules.prompt_response_topic_module).build()
-
-    # actual = _log(df, schema)
     result = wf.run(df)
     actual = result.metrics
 
@@ -245,3 +241,34 @@ def test_custom_topic():
     for column in expected_columns:
         if column not in ["prompt", "response"]:
             assert actual.loc[column]["distribution/max"] >= 0.50
+
+
+def test_topic_name_sanitize():
+    """
+    A topic containing upper case letters and a space must yield lower case,
+    underscore separated metric column names.
+    """
+    raw_topics = ["Fishing supplies"]
+    prompt_column = "prompt.topics.fishing_supplies"
+    response_column = "response.topics.fishing_supplies"
+
+    frame = pd.DataFrame(
+        {
+            "prompt": ["What's the best kind of bait?"],
+            "response": ["The best kind of bait is worms."],
+        }
+    )
+
+    prompt_metric = lib.prompt.topics(raw_topics)
+    response_metric = lib.response.topics(raw_topics)
+    workflow = Workflow(metrics=[prompt_metric, response_metric])
+    metrics = workflow.run(frame).metrics
+
+    # Expected column order: prompt topic metric, response topic metric, then "id".
+    assert metrics.columns.tolist() == [prompt_column, response_column, "id"]
+
+    # Lift the column display limit and print the transposed metrics frame.
+    pd.set_option("display.max_columns", None)
+    print(metrics.transpose())
+
+    assert metrics[prompt_column][0] > 0.50