Skip to content

Commit

Permalink
Set the default regular expression flavour to 're'
Browse files (browse the repository at this point in the history)
  • Loading branch information
vruusmann committed Oct 16, 2024
1 parent 3f89161 commit 2e583e9
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 15 deletions.
7 changes: 5 additions & 2 deletions sklearn2pmml/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,9 @@ class RegExTransformer(BaseEstimator, TransformerMixin):
def __init__(self, pattern, re_flavour):
super(RegExTransformer, self).__init__()
self.pattern = pattern
+ re_flavours = ["pcre", "pcre2", "re"]
+ if re_flavour not in re_flavours:
+ raise ValueError("Regular Expressions flavour {0} not in {1}".format(re_flavour, re_flavours))
self.re_flavour = re_flavour

def _regex_func(self, regex_engine):
Expand All @@ -592,7 +595,7 @@ def transform(self, X):
class MatchesTransformer(RegExTransformer):
"""Match RE pattern."""

- def __init__(self, pattern, re_flavour = None):
+ def __init__(self, pattern, re_flavour = "re"):
super(MatchesTransformer, self).__init__(pattern = pattern, re_flavour = re_flavour)

def _regex_func(self, regex_engine):
Expand All @@ -604,7 +607,7 @@ def matches(x):
class ReplaceTransformer(RegExTransformer):
"""Replace all RE pattern matches."""

- def __init__(self, pattern, replacement, re_flavour = None):
+ def __init__(self, pattern, replacement, re_flavour = "re"):
super(ReplaceTransformer, self).__init__(pattern = pattern, re_flavour = re_flavour)
self.replacement = replacement

Expand Down
12 changes: 1 addition & 11 deletions sklearn2pmml/preprocessing/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,21 +59,11 @@ def replace(self, replacement, x):
return self.pattern_.substitute(replacement, x)

def make_regex_engine(pattern, re_flavour):
- if re_flavour is None:
- try:
- import pcre2
-
- re_flavour = "pcre2"
- except ImportError:
- warnings.warn("Perl Compatible Regular Expressions (PCRE) library is not available, falling back to built-in Regular Expressions (RE) library. Transformation results might not be reproducible between Python and PMML environments when using more complex patterns", Warning)
- re_flavour = "re"
-
if re_flavour == "pcre":
return PCREEngine(pattern)
elif re_flavour == "pcre2":
return PCRE2Engine(pattern)
elif re_flavour == "re":
return REEngine(pattern)
else:
- re_flavours = ["pcre", "pcre2", "re"]
- raise ValueError("Regular Expressions flavour {0} not in {1}".format(re_flavour, re_flavours))
+ raise ValueError(re_flavour)
4 changes: 2 additions & 2 deletions sklearn2pmml/preprocessing/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,14 +657,14 @@ class MatchesTransformerTest(TransformerTest):

def test_transform(self):
X = Series(["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
- transformer = MatchesTransformer("ar?y", re_flavour = "re")
+ transformer = MatchesTransformer("ar?y")
self.assertEqual([True, True, False, False, True, False, False, False, False, False, False, False], self._transform1d(transformer, X).tolist())

class ReplaceTransformerTest(TransformerTest):

def test_transform(self):
X = Series(["A", "B", "BA", "BB", "BAB", "ABBA", "BBBB"])
- transformer = ReplaceTransformer("B+", "c", re_flavour = "re")
+ transformer = ReplaceTransformer("B+", "c")
self.assertEqual(["A", "c", "cA", "c", "cAc", "AcA", "c"], self._transform1d(transformer, X).tolist())
vectorizer = CountVectorizer(token_pattern = r"\w+")
pipeline = make_pipeline(transformer, vectorizer)
Expand Down

0 comments on commit 2e583e9

Please sign in to comment.