[buffering] optimize use of regular expressions (#328)

* [buffering] optimizations in regex handling
neogeny · Nov 20, 2023 · commit e9f20be
1 parent 06509b6
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 24 deletions.
diff --git a/tatsu/buffering.py b/tatsu/buffering.py
@@ -243,19 +243,25 @@ def _index_comments(self, comments, selector):
                 previous.extend(comments)
 
     def _eat_regex(self, regex):
-        if regex is not None:
-            return list(takewhile(identity, map(self.matchre, repeat(regex))))
-        return None
+        if not regex:
+            return
+        while self._matchre_fast(regex):
+            pass
+
+    def _eat_regex_list(self, regex):
+        if not regex:
+            return []
+        return list(takewhile(identity, map(self.matchre, repeat(regex))))
 
     def eat_whitespace(self):
         return self._eat_regex(self.whitespace_re)
 
     def eat_comments(self):
-        comments = self._eat_regex(self.config.comments_re)
+        comments = self._eat_regex_list(self.config.comments_re)
         self._index_comments(comments, lambda x: x.inline)
 
     def eat_eol_comments(self):
-        comments = self._eat_regex(self.config.eol_comments_re)
+        comments = self._eat_regex_list(self.config.eol_comments_re)
         self._index_comments(comments, lambda x: x.eol)
 
     def next_token(self):
@@ -293,7 +299,7 @@ def is_space(self):
     def is_name_char(self, c):
         return c is not None and (c.isalnum() or c in self._namechar_set)
 
-    def match(self, token):
+    def match(self, token: str) -> str | None:
         if token is None:
             return self.atend()
 
@@ -303,23 +309,31 @@ def match(self, token):
         else:
             is_match = self.text[p: p + len(token)] == token
 
-        if is_match:
-            self.move(len(token))
-            partial_match = (
-                self.nameguard
-                and token
-                and token[0].isalpha()
-                and all(self.is_name_char(t) for t in token)
-                and self.is_name_char(self.current)
-            )
-            if not partial_match:
-                return token
-        self.goto(p)
-        return None
+        if not is_match:
+            return None
+
+        self.move(len(token))
+        partial_match = (
+            self.nameguard
+            and token
+            and token[0].isalpha()
+            and self.is_name_char(self.current)
+            and all(self.is_name_char(t) for t in token)
+        )
+        if partial_match:
+            self.goto(p)
+            return None
+
+        return token
+
+    def _matchre_fast(self, pattern):
+        if not (match := self._scanre(pattern)):
+            return
+
+        self.move(len(match.group()))
 
     def matchre(self, pattern):
-        match = self._scanre(pattern)
-        if match is None:
+        if not (match := self._scanre(pattern)):
             return None
 
         matched = match.group()

diff --git a/tatsu/util/misc.py b/tatsu/util/misc.py
@@ -46,10 +46,8 @@ def match_to_find(m: re.Match):
     g = m.groups(default=m.string[0:0])
     if len(g) == 1:
         return g[0]
-    elif g:
-        return g
     else:
-        return m.group()
+        return g or m.group()
 
 
 def findalliter(pattern, string, pos=None, endpos=None, flags=0):