Skip to content

Commit

Permalink
Added support for PCRE2 regular expressions
Browse files Browse the repository at this point in the history
  • Loading branch information
vruusmann committed Oct 14, 2024
1 parent 8c0e774 commit 1c4524e
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 2 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ lxml==5.1.0
numpy==1.22.4; python_version == '3.8'
numpy==1.26.4; python_version >= '3.9'
optbinning==0.19.0
pcre2==0.4.0
pandas==1.5.3
scikit-learn
scikit-lego==0.7.4
Expand Down
19 changes: 19 additions & 0 deletions sklearn2pmml/preprocessing/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,25 @@ def matches(self, x):
def replace(self, replacement, x):
    """Return ``x`` after delegating the substitution to ``self.pattern_.sub``."""
    compiled = self.pattern_
    return compiled.sub(replacement, x)

class PCRE2Engine(RegExEngine):
    """Regular expression engine that delegates to the ``pcre2`` package."""

    def __init__(self, pattern):
        """Compile ``pattern`` using ``pcre2.compile``.

        The ``pcre2`` import is local to the constructor, so it is only
        attempted when an engine instance is actually created.
        """
        import pcre2

        super(PCRE2Engine, self).__init__(pattern)
        self.pattern_ = pcre2.compile(pattern)

    def matches(self, x):
        """Return ``True`` if scanning ``x`` yields at least one match, else ``False``."""
        # Only the first scan result matters; the ``None`` default replaces
        # a hand-written ``try``/``except StopIteration`` around ``__next__()``.
        return next(self.pattern_.scan(x), None) is not None

    def replace(self, replacement, x):
        """Return the result of ``self.pattern_.substitute(replacement, x)``."""
        return self.pattern_.substitute(replacement, x)

def make_regex_engine(pattern):
try:
return PCREEngine(pattern)
Expand Down
18 changes: 16 additions & 2 deletions sklearn2pmml/preprocessing/tests/test_regex.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
from sklearn2pmml.preprocessing.regex import REEngine
from sklearn2pmml.preprocessing.regex import PCRE2Engine, REEngine
from unittest import TestCase

class PCRE2EngineTest(TestCase):
    """Checks for the PCRE2-backed regular expression engine."""

    def test_matches(self):
        """``ar?y`` is found in "January" and "May", but not in "March"."""
        pcre2_engine = PCRE2Engine("ar?y")
        expectations = (
            ("January", self.assertTrue),
            ("March", self.assertFalse),
            ("May", self.assertTrue),
        )
        for month, check in expectations:
            check(pcre2_engine.matches(month))

    def test_replace(self):
        r"""``$1`` is expanded as a group reference, whereas ``\1`` stays literal."""
        pcre2_engine = PCRE2Engine(r"(\w)")

        expanded = pcre2_engine.replace(r"$1 ", "Puppy")
        self.assertEqual("P u p p y", expanded.rstrip())

        literal = pcre2_engine.replace(r"\1 ", "Puppy")
        self.assertEqual(r"\1 \1 \1 \1 \1", literal.rstrip())

class REEngineTest(TestCase):

def test_matches(self):
Expand All @@ -14,4 +27,5 @@ def test_matches(self):

def test_replace(self):
engine = REEngine(r"(\w)")
self.assertEqual("P u p p y", engine.replace(r"\1 ", "Puppy").strip())
self.assertEqual("$1 $1 $1 $1 $1", engine.replace(r"$1 ", "Puppy").rstrip())
self.assertEqual("P u p p y", engine.replace(r"\1 ", "Puppy").rstrip())

0 comments on commit 1c4524e

Please sign in to comment.