From 7dcbf3a8554884f8dc205ee674d3b386f4f78941 Mon Sep 17 00:00:00 2001
From: Anthony Naddeo <anthony@whylabs.ai>
Date: Tue, 26 Mar 2024 13:46:01 -0700
Subject: [PATCH] Add regex url metric

After doing manual tests, I noticed that the url detection from presidio
does very poorly when code is involved. It has false positives on things
like string.split(). The regex performs much better for less compute for
urls.
---
 langkit/metrics/library.py            | 12 ++++++++++++
 langkit/metrics/regexes/regexes.py    | 20 +++++++++++++++++++-
 tests/langkit/metrics/test_regexes.py | 27 +++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/langkit/metrics/library.py b/langkit/metrics/library.py
index cac7e40..2050bef 100644
--- a/langkit/metrics/library.py
+++ b/langkit/metrics/library.py
@@ -226,6 +226,12 @@ def credit_card_number() -> MetricCreator:
 
                 return prompt_credit_card_number_regex_metric
 
+            @staticmethod
+            def url() -> MetricCreator:
+                from langkit.metrics.regexes.regexes import prompt_url_regex_metric
+
+                return prompt_url_regex_metric
+
         class similarity:
             """
             These metrics are used to compare the response to various examples and use cosine similarity/embedding distances
@@ -427,6 +433,12 @@ def credit_card_number() -> MetricCreator:
 
                 return response_credit_card_number_regex_metric
 
+            @staticmethod
+            def url() -> MetricCreator:
+                from langkit.metrics.regexes.regexes import response_url_regex_metric
+
+                return response_url_regex_metric
+
         class sentiment:
             def __call__(self) -> MetricCreator:
                 return self.sentiment_score()
diff --git a/langkit/metrics/regexes/regexes.py b/langkit/metrics/regexes/regexes.py
index 142dd9e..9ec758a 100644
--- a/langkit/metrics/regexes/regexes.py
+++ b/langkit/metrics/regexes/regexes.py
@@ -1,4 +1,5 @@
 import os
+import re
 from dataclasses import dataclass
 from functools import partial
 from typing import Any, Dict, List, Optional, Union
@@ -6,11 +7,23 @@
 import pandas as pd
 
 from langkit.core.metric import Metric, MetricCreator, SingleMetric, SingleMetricResult, UdfInput
-from langkit.metrics.regexes.regex_loader import CompiledPatternGroups, load_patterns_file
+from langkit.metrics.regexes.regex_loader import CompiledPattern, CompiledPatternGroups, load_patterns_file
 
 __current_module_path = os.path.dirname(__file__)
 __default_pattern_file = os.path.join(__current_module_path, "pattern_groups.json")
 __default_patterns: CompiledPatternGroups = load_patterns_file(__default_pattern_file)
+__default_patterns["patterns"].append(
+    # This regex is too complicated to define in json
+    CompiledPattern(
+        name="url",
+        expressions=[
+            re.compile(
+                r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
+            )
+        ],
+        substitutions=None,
+    )
+)
 
 
 def __sanitize_name_for_metric(pattern_name: str) -> str:
@@ -123,6 +136,7 @@ def udf(text: Union[pd.DataFrame, Dict[str, List[Any]]]) -> SingleMetricResult:
 prompt_phone_number_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "phone number")
 prompt_mailing_address_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "mailing address")
 prompt_email_address_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "email address")
+prompt_url_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "url")
 
 prompt_regex_metric = [
     prompt_ssn_regex_metric,
@@ -130,6 +144,7 @@ def udf(text: Union[pd.DataFrame, Dict[str, List[Any]]]) -> SingleMetricResult:
     prompt_phone_number_regex_metric,
     prompt_mailing_address_regex_metric,
     prompt_email_address_regex_metric,
+    prompt_url_regex_metric,
 ]
 
 response_ssn_regex_metric = partial(__single_regex_module, "response", __default_patterns, "SSN")
@@ -137,6 +152,7 @@ def udf(text: Union[pd.DataFrame, Dict[str, List[Any]]]) -> SingleMetricResult:
 response_phone_number_regex_metric = partial(__single_regex_module, "response", __default_patterns, "phone number")
 response_mailing_address_regex_metric = partial(__single_regex_module, "response", __default_patterns, "mailing address")
 response_email_address_regex_metric = partial(__single_regex_module, "response", __default_patterns, "email address")
+response_url_regex_metric = partial(__single_regex_module, "response", __default_patterns, "url")
 
 response_regex_metric = [
     response_ssn_regex_metric,
@@ -144,6 +160,7 @@ def udf(text: Union[pd.DataFrame, Dict[str, List[Any]]]) -> SingleMetricResult:
     response_phone_number_regex_metric,
     response_mailing_address_regex_metric,
     response_email_address_regex_metric,
+    response_url_regex_metric,
 ]
 
 prompt_response_ssn_regex_module = [prompt_ssn_regex_metric, response_ssn_regex_metric]
@@ -151,6 +168,7 @@ def udf(text: Union[pd.DataFrame, Dict[str, List[Any]]]) -> SingleMetricResult:
 prompt_response_phone_number_regex_module = [prompt_phone_number_regex_metric, response_phone_number_regex_metric]
 prompt_response_mailing_address_regex_module = [prompt_mailing_address_regex_metric, response_mailing_address_regex_metric]
 prompt_response_email_address_regex_module = [prompt_email_address_regex_metric, response_email_address_regex_metric]
+prompt_response_url_regex_module = [prompt_url_regex_metric, response_url_regex_metric]
 
 
 def custom_regex_metric(column_name: str, file_or_patterns: Optional[Union[str, CompiledPatternGroups]] = None) -> MetricCreator:
diff --git a/tests/langkit/metrics/test_regexes.py b/tests/langkit/metrics/test_regexes.py
index 430a409..2303d4e 100644
--- a/tests/langkit/metrics/test_regexes.py
+++ b/tests/langkit/metrics/test_regexes.py
@@ -10,6 +10,7 @@
 
 import whylogs as why
 from langkit.core.metric import WorkflowMetricConfig, WorkflowMetricConfigBuilder
+from langkit.core.workflow import Workflow
 from langkit.metrics.regexes.regex_loader import CompiledPatternGroups, PatternGroups
 from langkit.metrics.regexes.regexes import (
     get_custom_regex_frequent_items_for_column_module,
@@ -27,12 +28,14 @@
     prompt_response_phone_number_regex_module,
     prompt_response_ssn_regex_module,
     prompt_ssn_regex_metric,
+    prompt_url_regex_metric,
     response_credit_card_number_regex_metric,
     response_default_regexes_module,
     response_email_address_regex_metric,
     response_mailing_address_regex_metric,
     response_phone_number_regex_metric,
     response_ssn_regex_metric,
+    response_url_regex_metric,
 )
 from langkit.metrics.whylogs_compat import create_whylogs_udf_schema
 from whylogs.core.metrics.metrics import FrequentItem
@@ -76,6 +79,30 @@ def _log(item: Any, conf: WorkflowMetricConfig) -> pd.DataFrame:
     return why.log(item, schema=schema).view().to_pandas()  # type: ignore
 
 
+def test_prompt_regex_df_url():
+    df = pd.DataFrame(
+        {
+            "prompt": [
+                "Does this code look good? foo.strip()/10. My blog is at whylabs.ai/foo.html",
+            ],
+            "response": [
+                "Yeah. Nice blog, mine is at whylabs.ai/bar.html",
+            ],
+        }
+    )
+
+    wf = Workflow(metrics=[prompt_url_regex_metric, response_url_regex_metric])
+    result = wf.run(df)
+
+    actual = result.metrics
+
+    expected_columns = ["prompt.regex.url", "response.regex.url", "id"]
+
+    assert list(actual.columns) == expected_columns
+    assert actual["prompt.regex.url"][0] == 1
+    assert actual["response.regex.url"][0] == 1
+
+
 def test_prompt_regex_df_ssn():
     df = pd.DataFrame(
         {