diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 9f41bf1..7f5b02d 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.96
+current_version = 0.0.103
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize =
diff --git a/langkit/metrics/topic.py b/langkit/metrics/topic.py
index 8928e90..8b79e56 100644
--- a/langkit/metrics/topic.py
+++ b/langkit/metrics/topic.py
@@ -38,6 +38,13 @@ def __get_scores_per_label(
     return scores_per_label
 
 
+def _sanitize_metric_name(topic: str) -> str:
+    """
+    sanitize a metric name created from a topic. Replace white space with underscores, etc.
+    """
+    return topic.replace(" ", "_").lower()
+
+
 def topic_metric(input_name: str, topics: List[str], hypothesis_template: Optional[str] = None) -> MultiMetric:
     hypothesis_template = hypothesis_template or _hypothesis_template
 
@@ -62,7 +69,7 @@ def process_row(row: pd.DataFrame) -> Dict[str, List[Optional[float]]]:
     def cache_assets():
         __classifier.value
 
-    metric_names = [f"{input_name}.topics.{topic}" for topic in topics]
+    metric_names = [f"{input_name}.topics.{_sanitize_metric_name(topic)}" for topic in topics]
 
     return MultiMetric(names=metric_names, input_name=input_name, evaluate=udf, cache_assets=cache_assets)
 
diff --git a/pyproject.toml b/pyproject.toml
index ffe2aa6..d1c4e0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.96"
+version = "0.0.103"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai "]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
diff --git a/tests/langkit/metrics/test_topic.py b/tests/langkit/metrics/test_topic.py
index d5340cd..1dfe4b3 100644
--- a/tests/langkit/metrics/test_topic.py
+++ b/tests/langkit/metrics/test_topic.py
@@ -172,10 +172,6 @@ def test_topic_library():
     topics = ["fishing", "boxing", "hiking", "swimming"]
     wf = Workflow(metrics=[lib.prompt.topics(topics), lib.response.topics(topics)])
 
-    # schema = WorkflowMetricConfigBuilder().add(lib.prompt.topics(topics)).add(lib.response.topics(topics)).build()
-    # schema = WorkflowMetricConfigBuilder().add(custom_topic_modules.prompt_response_topic_module).build()
-
-    # actual = _log(df, schema)
     result = wf.run(df)
     actual = result.metrics
 
@@ -245,3 +241,34 @@ def test_custom_topic():
     for column in expected_columns:
         if column not in ["prompt", "response"]:
            assert actual.loc[column]["distribution/max"] >= 0.50
+
+
+def test_topic_name_sanitize():
+    df = pd.DataFrame(
+        {
+            "prompt": [
+                "What's the best kind of bait?",
+            ],
+            "response": [
+                "The best kind of bait is worms.",
+            ],
+        }
+    )
+
+    topics = ["Fishing supplies"]
+    wf = Workflow(metrics=[lib.prompt.topics(topics), lib.response.topics(topics)])
+
+    result = wf.run(df)
+    actual = result.metrics
+
+    expected_columns = [
+        "prompt.topics.fishing_supplies",
+        "response.topics.fishing_supplies",
+        "id",
+    ]
+    assert actual.columns.tolist() == expected_columns
+
+    pd.set_option("display.max_columns", None)
+    print(actual.transpose())
+
+    assert actual["prompt.topics.fishing_supplies"][0] > 0.50
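
Reviewer note (not part of the patch): a minimal sketch of how the new sanitization maps user-supplied topics to metric column names, assuming only the replace-spaces-and-lowercase rule shown above. "Fishing supplies" comes from the new test; "Outdoor Gear" is a hypothetical topic added purely for illustration.

def _sanitize_metric_name(topic: str) -> str:
    # Same rule as the helper added in langkit/metrics/topic.py:
    # spaces become underscores and the result is lowercased.
    return topic.replace(" ", "_").lower()

topics = ["Fishing supplies", "Outdoor Gear"]  # "Outdoor Gear" is illustrative only
metric_names = [f"prompt.topics.{_sanitize_metric_name(t)}" for t in topics]
print(metric_names)
# ['prompt.topics.fishing_supplies', 'prompt.topics.outdoor_gear']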