From a02f845d29914534160eac89b09c05636601f5d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juancarlo=20A=C3=B1ez?= <apalala@gmail.com>
Date: Sun, 26 Nov 2023 11:34:54 -0400
Subject: [PATCH] Fix incorrect escaping of regular expressions for
 @@whitespace (#331)

fixes #330

* [buffering] do not re.escape regex for whitespace
* [tatsu.grammar] specify missing @@whitespace
* [bootstrap] use generated parser that contains def for whitespace
* [buffering] keep default for whitespace but honor None correctly
* [dis] bump up version for release
---
 docs/directives.rst            |  9 ++++++++-
 docs/syntax.rst                | 14 ++++++++++++++
 grammar/tatsu.ebnf             |  1 +
 tatsu/_version.py              |  2 +-
 tatsu/bootstrap.py             | 10 +++++-----
 tatsu/buffering.py             | 20 ++++++++++++++++----
 tatsu/codegen/python.py        |  6 +++---
 tatsu/infos.py                 |  9 ++++++++-
 tatsu/parser_semantics.py      |  1 +
 tatsu/util/_common.py          |  2 +-
 test/grammar/directive_test.py |  7 +++++++
 11 files changed, 65 insertions(+), 16 deletions(-)
diff --git a/docs/directives.rst b/docs/directives.rst
index 3a173e4e..e255c37b 100644
--- a/docs/directives.rst
+++ b/docs/directives.rst
@@ -109,8 +109,15 @@ Enabling ``@@parseinfo`` will allow precise reporting over the input source-code
 ``@@whitespace :: <regexp>``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Provides a regular expression for the whitespace to be ignored by the parser. It defaults to ``/(?s)\s+/``:
+Provides a regular expression for the whitespace to be ignored by the parser. If no definition is
+provided, ``r'(?m)\s+'`` is used as the default. For example, to skip only tabs and spaces:
 
 .. code::
 
     @@whitespace :: /[\t ]+/
+
+To disable any parsing of whitespace, use ``None`` for the definition:
+
+.. code::
+
+    @@whitespace :: None
diff --git a/docs/syntax.rst b/docs/syntax.rst
index 6c8da026..7204e274 100644
--- a/docs/syntax.rst
+++ b/docs/syntax.rst
@@ -699,6 +699,20 @@ overwrite the setting in the grammar::
 
   @@whitespace :: /[\t ]+/
 
+If no ``whitespace`` or ``@@whitespace`` is specified, |TatSu| will use
+``r'(?m)\s+'`` as a default. Use ``None`` to have *no whitespace definition*.
+
+
+.. code:: python
+
+   parser = MyParser(text, whitespace=None)
+
+or:
+
+.. code::
+
+    @@whitespace :: None
+
 Case Sensitivity
 ~~~~~~~~~~~~~~~~
 
diff --git a/grammar/tatsu.ebnf b/grammar/tatsu.ebnf
index cd28a41a..340b4a14 100644
--- a/grammar/tatsu.ebnf
+++ b/grammar/tatsu.ebnf
@@ -1,4 +1,5 @@
 @@grammar :: Tatsu
+@@whitespace :: /\s+/
 @@comments :: ?"(?sm)[(][*](?:.|\n)*?[*][)]"
 @@eol_comments :: ?"#[^\n]*$"
 @@parseinfo :: True
diff --git a/tatsu/_version.py b/tatsu/_version.py
index 66b78b93..0a7b07c2 100644
--- a/tatsu/_version.py
+++ b/tatsu/_version.py
@@ -1 +1 @@
-__version__ = '5.10.4b1'
+__version__ = '5.10.4'
diff --git a/tatsu/bootstrap.py b/tatsu/bootstrap.py
index 06a21f15..33e36f36 100644
--- a/tatsu/bootstrap.py
+++ b/tatsu/bootstrap.py
@@ -9,7 +9,7 @@
 # Any changes you make to it will be overwritten the next time
 # the file is generated.
 
-# ruff: noqa: I001, SIM117
+# ruff: noqa: I001, F401, SIM117
 
 import sys
 from pathlib import Path
@@ -17,9 +17,9 @@
 from tatsu.buffering import Buffer
 from tatsu.parsing import Parser
 from tatsu.parsing import tatsumasu
-from tatsu.parsing import leftrec, nomemo, isname  # noqa: F401
+from tatsu.parsing import leftrec, nomemo, isname
 from tatsu.infos import ParserConfig
-from tatsu.util import re, generic_main  # noqa: F401
+from tatsu.util import re, generic_main
 
 
 KEYWORDS: set[str] = set()
@@ -30,7 +30,7 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings):
         config = ParserConfig.new(
             config,
             owner=self,
-            whitespace=None,
+            whitespace=re.compile(r"\s+"),
             nameguard=None,
             ignorecase=False,
             namechars='',
@@ -49,7 +49,7 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
         config = ParserConfig.new(
             config,
             owner=self,
-            whitespace=None,
+            whitespace=re.compile(r"\s+"),
             nameguard=None,
             ignorecase=False,
             namechars='',
diff --git a/tatsu/buffering.py b/tatsu/buffering.py
index 1df5e44f..5b3bfa96 100644
--- a/tatsu/buffering.py
+++ b/tatsu/buffering.py
@@ -13,7 +13,14 @@
 from typing import Any
 
 from .exceptions import ParseError
-from .infos import CommentInfo, LineIndexInfo, LineInfo, ParserConfig, PosLine
+from .infos import (
+    CommentInfo,
+    LineIndexInfo,
+    LineInfo,
+    ParserConfig,
+    PosLine,
+    UndefinedStr,
+)
 from .tokenizing import Tokenizer
 from .util import (
     RETYPE,
@@ -74,15 +81,20 @@ def whitespace(self):
 
     @staticmethod
     def build_whitespace_re(whitespace):
-        if whitespace is None:
+        if type(whitespace) is UndefinedStr:
             return WHITESPACE_RE
+        if whitespace is None or whitespace == '':  # not `in {None, ''}`: a list/set is unhashable
+            return None
         elif isinstance(whitespace, RETYPE):
             return whitespace
         elif whitespace:
             if not isinstance(whitespace, str):
+                # FIXME:
+                #   this feature is undocumented
+                #   only regular expressions should be supported
                 # a list or a set?
-                whitespace = ''.join(c for c in whitespace)
-            return re.compile(f'(?m)[{re.escape(whitespace)}]+')
+                whitespace = f"[{''.join(c for c in whitespace)}]+"
+            return re.compile(f'(?m){whitespace}')
         else:
             return None
 
diff --git a/tatsu/codegen/python.py b/tatsu/codegen/python.py
index 2414e756..a8dfc583 100755
--- a/tatsu/codegen/python.py
+++ b/tatsu/codegen/python.py
@@ -509,7 +509,7 @@ def {name}(self, ast):
                 # Any changes you make to it will be overwritten the next time
                 # the file is generated.
 
-                # ruff: noqa: I001, SIM117
+                # ruff: noqa: I001, F401, SIM117
 
                 import sys
                 from pathlib import Path
@@ -517,9 +517,9 @@ def {name}(self, ast):
                 from tatsu.buffering import Buffer
                 from tatsu.parsing import Parser
                 from tatsu.parsing import tatsumasu
-                from tatsu.parsing import leftrec, nomemo, isname  # noqa: F401
+                from tatsu.parsing import leftrec, nomemo, isname
                 from tatsu.infos import ParserConfig
-                from tatsu.util import re, generic_main  # noqa: F401
+                from tatsu.util import re, generic_main
 
 
                 KEYWORDS: set[str] = set({keywords})
diff --git a/tatsu/infos.py b/tatsu/infos.py
index efcbc66b..f6c29932 100644
--- a/tatsu/infos.py
+++ b/tatsu/infos.py
@@ -11,6 +11,13 @@
 from .util.unicode_characters import C_DERIVE
 
 
+class UndefinedStr(str):
+    """Marker ``str`` subclass: its instance ``_undefined_str`` is the 'not specified' default for ``ParserConfig.whitespace``."""
+
+
+_undefined_str = UndefinedStr()
+
+
 @dataclasses.dataclass
 class ParserConfig:
     owner: Any = None
@@ -48,7 +55,7 @@ class ParserConfig:
     ignorecase: bool | None = False
     namechars: str = ''
     nameguard: bool | None = None  # implied by namechars
-    whitespace: str | None = None
+    whitespace: str | None = _undefined_str
 
     parseinfo: bool = False
 
diff --git a/tatsu/parser_semantics.py b/tatsu/parser_semantics.py
index 946b75da..a5ebba90 100644
--- a/tatsu/parser_semantics.py
+++ b/tatsu/parser_semantics.py
@@ -136,6 +136,7 @@ def grammar(self, ast, *args):
         keywords = list(flatten(ast.keywords)) or []
 
         if directives.get('whitespace') in {'None', 'False'}:
+            # NOTE: use '' because None will _not_ override defaults in configuration
             directives['whitespace'] = ''
 
         name = (
diff --git a/tatsu/util/_common.py b/tatsu/util/_common.py
index 510cd6bd..cf0b8a5e 100644
--- a/tatsu/util/_common.py
+++ b/tatsu/util/_common.py
@@ -27,7 +27,7 @@
 logger.addHandler(ch)
 
 
-WHITESPACE_RE = re.compile(r'(?s)\s+')
+WHITESPACE_RE = re.compile(r'(?m)\s+')
 RETYPE = type(re.compile('.'))
 
 
diff --git a/test/grammar/directive_test.py b/test/grammar/directive_test.py
index a98f485f..024562c9 100644
--- a/test/grammar/directive_test.py
+++ b/test/grammar/directive_test.py
@@ -44,6 +44,13 @@ def test_whitespace_none_directive(self):
             else:
                 self.fail('parsed through non-whitespace')
 
+    def test_default_whitespace(self):
+        grammar = r"""
+            start = {'x'}+ $;
+        """
+        # smoke test: no @@whitespace directive in the grammar, yet space-separated input must parse
+        tatsu.parse(grammar, "x x x")
+
     def test_eol_comments_re_directive(self):
         grammar = """
             @@eol_comments :: /#.*?$/