
Stats #257 (merged)

Merged 3 commits on Mar 9, 2024.
Changes from all commits
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.87
+current_version = 0.0.88
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize =
3 changes: 1 addition & 2 deletions langkit/core/workflow.py
@@ -178,8 +178,7 @@ def run(self, data: Dict[str, str]) -> EvaluationResult:
     def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResult:
         start = time.perf_counter()

-        if not self._initialized:
-            self.init()
+        self.init()

         if not isinstance(data, pd.DataFrame):
             if not is_dict_with_strings(data):
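This hunk drops the `_initialized` guard at the call site, so `run` now calls `self.init()` on every invocation; that stays cheap only if `init` itself short-circuits after the first call. A minimal sketch of that idempotent-init pattern, with hypothetical names (the real `EvaluationWorkflow.init` may differ):

    class Workflow:
        def __init__(self) -> None:
            self._initialized = False

        def init(self) -> None:
            # Short-circuit so run() can call this unconditionally.
            if self._initialized:
                return
            self._initialized = True
            # ... one-time setup: load models, warm metric caches, etc.

        def run(self, data) -> None:
            self.init()  # cheap no-op after the first call
            # ... evaluate metrics against data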
28 changes: 10 additions & 18 deletions langkit/metrics/library.py
@@ -48,14 +48,14 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:

     - prompt.pii.*
     - prompt.stats.token_count
-    - prompt.text_stat.char_count
+    - prompt.stats.char_count
     - prompt.similarity.injection
     - prompt.similarity.jailbreak

     - response.pii.*
     - response.stats.token_count
-    - response.text_stat.char_count
-    - response.text_stat.reading_ease
+    - response.stats.char_count
+    - response.stats.reading_ease
     - response.sentiment.sentiment_score
     - response.toxicity.toxicity_score
     - response.similarity.refusal
@@ -64,16 +64,16 @@ def recommended(prompt: bool = True, response: bool = True) -> MetricCreator:
     prompt_metrics = [
         lib.prompt.pii,
         lib.prompt.stats.token_count,
-        lib.prompt.text_stat.char_count,
+        lib.prompt.stats.char_count,
         lib.prompt.similarity.injection,
         lib.prompt.similarity.jailbreak,
     ]

     response_metrics = [
         lib.response.pii,
         lib.response.stats.token_count,
-        lib.response.text_stat.char_count,
-        lib.response.text_stat.reading_ease,
+        lib.response.stats.char_count,
+        lib.response.stats.reading_ease,
         lib.response.sentiment.sentiment_score,
         lib.response.toxicity.toxicity_score,
         lib.response.similarity.refusal,
@@ -117,11 +117,11 @@ def toxicity_score() -> MetricCreator:

             return prompt_toxicity_metric

-    class text_stat:
+    class stats:
         def __call__(self) -> MetricCreator:
             from langkit.metrics.text_statistics import prompt_textstat_metric

-            return prompt_textstat_metric
+            return [lib.prompt.stats.token_count, prompt_textstat_metric]

         @staticmethod
         def char_count() -> MetricCreator:
@@ -171,10 +171,6 @@ def difficult_words() -> MetricCreator:

             return prompt_difficult_words_metric

-    class stats:
-        def __call__(self) -> MetricCreator:
-            return [lib.prompt.stats.token_count]
-
         @staticmethod
         def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator:
             """
@@ -303,11 +299,11 @@ def toxicity_score() -> MetricCreator:

             return response_toxicity_metric

-    class text_stat:
+    class stats:
         def __call__(self) -> MetricCreator:
             from langkit.metrics.text_statistics import response_textstat_metric

-            return response_textstat_metric
+            return [lib.response.stats.token_count, response_textstat_metric]

         @staticmethod
         def char_count() -> MetricCreator:
@@ -357,10 +353,6 @@ def difficult_words() -> MetricCreator:

             return response_difficult_words_metric

-    class stats:
-        def __call__(self) -> MetricCreator:
-            return [lib.response.stats.token_count]
-
         @staticmethod
         def token_count(tiktoken_encoding: Optional[str] = None) -> MetricCreator:
             """
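With the rename, the textstat-derived metrics move from `text_stat` to `stats`, and calling the `stats` group now also pulls in `token_count`. A hedged usage sketch, modeled on the workflow construction in the tests below (exact import paths assumed):

    from langkit.core.workflow import EvaluationWorkflow
    from langkit.metrics.library import lib

    # The whole group: token_count plus the textstat-derived metrics.
    wf = EvaluationWorkflow(metrics=[lib.prompt.stats])

    # Or pick individual metrics under their new names.
    wf = EvaluationWorkflow(
        metrics=[lib.prompt.stats.char_count, lib.response.stats.reading_ease]
    )

    result = wf.run({"prompt": "hello", "response": "hi there"})
    print(result.metrics["prompt.stats.char_count"][0])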
2 changes: 1 addition & 1 deletion langkit/metrics/text_statistics.py
@@ -14,7 +14,7 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         return SingleMetricResult(metrics)

     return SingleMetric(
-        name=f"{column_name}.text_stat.{stat}",
+        name=f"{column_name}.stats.{stat}",
         input_name=column_name,
         evaluate=udf,
     )
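Because the name template changed, every metric this module emits now lands under the `stats` namespace (for example `prompt.stats.char_count` rather than `prompt.text_stat.char_count`), so downstream consumers keyed on the old `*.text_stat.*` names need the same update the tests below receive.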
8 changes: 8 additions & 0 deletions langkit/metrics/token.py
@@ -12,6 +12,12 @@ def _get_encoder(encoding: str):


 def token_metric(column_name: str, encoding: str = "cl100k_base") -> Metric:
+    def cache_assets():
+        _get_encoder(encoding)
+
+    def init():
+        _get_encoder(encoding)
+
     def udf(text: pd.DataFrame) -> SingleMetricResult:
         encoder = _get_encoder(encoding)
         encoding_len = [len(encoder.encode(it)) for it in UdfInput(text).iter_column_rows(column_name)]
@@ -21,6 +27,8 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         name=f"{column_name}.stats.token_count",
         input_name=column_name,
         evaluate=udf,
+        init=init,
+        cache_assets=cache_assets,
     )

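The new `cache_assets` and `init` hooks both just touch `_get_encoder`, which pre-warms the tiktoken encoder before the first evaluation; that only pays off if the lookup is memoized. A minimal sketch of the helper under that assumption (the real `_get_encoder` may differ):

    from functools import lru_cache

    import tiktoken

    @lru_cache(maxsize=None)
    def _get_encoder(encoding: str):
        # The first call builds (or downloads) the encoding; later calls,
        # including the one inside udf at evaluation time, are cache hits.
        return tiktoken.get_encoding(encoding)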
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.87"
+version = "0.0.88"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai <[email protected]>"]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
8 changes: 4 additions & 4 deletions tests/langkit/callbacks/test_webhook.py
@@ -7,8 +7,8 @@

 def test_webhook_failures_dont_ruin_run():
     wf = EvaluationWorkflow(
-        metrics=[metric_lib.prompt.text_stat.char_count],
-        validators=[validator_lib.constraint("prompt.text_stat.char_count", upper_threshold=5)],
+        metrics=[metric_lib.prompt.stats.char_count],
+        validators=[validator_lib.constraint("prompt.stats.char_count", upper_threshold=5)],
         callbacks=[callback_lib.webhook.basic_validation_failure("https://foo.bar")],  # will fail, url doesn't exist
     )

@@ -17,10 +17,10 @@ def test_webhook_failures_dont_ruin_run():
     assert result.validation_results.report == [
         ValidationFailure(
             id="0",
-            metric="prompt.text_stat.char_count",
+            metric="prompt.stats.char_count",
             details="Value 10 is above threshold 5",
             value=10,
             upper_threshold=5,
         )
     ]
-    assert result.metrics["prompt.text_stat.char_count"][0] == 10
+    assert result.metrics["prompt.stats.char_count"][0] == 10
6 changes: 3 additions & 3 deletions tests/langkit/metrics/test_library.py
@@ -19,7 +19,7 @@ def test_recommended():
         "prompt.pii.us_bank_number",
         "prompt.pii.redacted",
         "prompt.stats.token_count",
-        "prompt.text_stat.char_count",
+        "prompt.stats.char_count",
         "prompt.similarity.injection",
         "prompt.similarity.jailbreak",
         "response.pii.phone_number",
@@ -29,8 +29,8 @@ def test_recommended():
         "response.pii.us_bank_number",
         "response.pii.redacted",
         "response.stats.token_count",
-        "response.text_stat.char_count",
-        "response.text_stat.flesch_reading_ease",
+        "response.stats.char_count",
+        "response.stats.flesch_reading_ease",
         "response.sentiment.sentiment_score",
         "response.toxicity",
         "response.similarity.refusal",
37 changes: 0 additions & 37 deletions tests/langkit/metrics/test_metric_library.py

This file was deleted.
