From a02f845d29914534160eac89b09c05636601f5d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juancarlo=20A=C3=B1ez?= Date: Sun, 26 Nov 2023 11:34:54 -0400 Subject: [PATCH] Fix incorrect escaping of regular expressions for @@whitespace (#331) fixes #330 * [buffering] do not re.escape regex for whitespace * [tarsu.grammar] specify missing @@whitespace * [bootstrap] use generated parser that contains def for whitespace * [buffering] keep default for whitespace but honor None correctly * [dis] bump up version for release --- docs/directives.rst | 9 ++++++++- docs/syntax.rst | 14 ++++++++++++++ grammar/tatsu.ebnf | 1 + tatsu/_version.py | 2 +- tatsu/bootstrap.py | 10 +++++----- tatsu/buffering.py | 20 ++++++++++++++++---- tatsu/codegen/python.py | 6 +++--- tatsu/infos.py | 9 ++++++++- tatsu/parser_semantics.py | 1 + tatsu/util/_common.py | 2 +- test/grammar/directive_test.py | 7 +++++++ 11 files changed, 65 insertions(+), 16 deletions(-) diff --git a/docs/directives.rst b/docs/directives.rst index 3a173e4e..e255c37b 100644 --- a/docs/directives.rst +++ b/docs/directives.rst @@ -109,8 +109,15 @@ Enabling ``@@parseinfo`` will allow precise reporting over the input source-code ``@@whitespace :: `` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Provides a regular expression for the whitespace to be ignored by the parser. It defaults to ``/(?s)\s+/``: +Provides a regular expression for the whitespace to be ignored by the parser. If no definition is +provided, then ``r'(?m)\s+'`` will be used as default: .. code:: @@whitespace :: /[\t ]+/ + +To disable any parsing of whitespace, use ``None`` for the definition: + +.. code:: + + @@whitespace :: None diff --git a/docs/syntax.rst b/docs/syntax.rst index 6c8da026..7204e274 100644 --- a/docs/syntax.rst +++ b/docs/syntax.rst @@ -699,6 +699,20 @@ overwrite the setting in the grammar:: @@whitespace :: /[\t ]+/ +If no ``whitespace`` or ``@@whitespace`` is specified, |TatSu| will use +``r'(?m)\s+'`` as a default. Use ``None`` to have *no whitespace definition*. + + +.. code:: python + + parser = MyParser(text, whitespace=None) + +or: + +.. code:: + + @@whitespace :: None + Case Sensitivity ~~~~~~~~~~~~~~~~ diff --git a/grammar/tatsu.ebnf b/grammar/tatsu.ebnf index cd28a41a..340b4a14 100644 --- a/grammar/tatsu.ebnf +++ b/grammar/tatsu.ebnf @@ -1,4 +1,5 @@ @@grammar :: Tatsu +@@whitespace :: /\s+/ @@comments :: ?"(?sm)[(][*](?:.|\n)*?[*][)]" @@eol_comments :: ?"#[^\n]*$" @@parseinfo :: True diff --git a/tatsu/_version.py b/tatsu/_version.py index 66b78b93..0a7b07c2 100644 --- a/tatsu/_version.py +++ b/tatsu/_version.py @@ -1 +1 @@ -__version__ = '5.10.4b1' +__version__ = '5.10.4' diff --git a/tatsu/bootstrap.py b/tatsu/bootstrap.py index 06a21f15..33e36f36 100644 --- a/tatsu/bootstrap.py +++ b/tatsu/bootstrap.py @@ -9,7 +9,7 @@ # Any changes you make to it will be overwritten the next time # the file is generated. -# ruff: noqa: I001, SIM117 +# ruff: noqa: I001, F401, SIM117 import sys from pathlib import Path @@ -17,9 +17,9 @@ from tatsu.buffering import Buffer from tatsu.parsing import Parser from tatsu.parsing import tatsumasu -from tatsu.parsing import leftrec, nomemo, isname # noqa: F401 +from tatsu.parsing import leftrec, nomemo, isname from tatsu.infos import ParserConfig -from tatsu.util import re, generic_main # noqa: F401 +from tatsu.util import re, generic_main KEYWORDS: set[str] = set() @@ -30,7 +30,7 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings): config = ParserConfig.new( config, owner=self, - whitespace=None, + whitespace=re.compile(r"\s+"), nameguard=None, ignorecase=False, namechars='', @@ -49,7 +49,7 @@ def __init__(self, /, config: ParserConfig | None = None, **settings): config = ParserConfig.new( config, owner=self, - whitespace=None, + whitespace=re.compile(r"\s+"), nameguard=None, ignorecase=False, namechars='', diff --git a/tatsu/buffering.py b/tatsu/buffering.py index 1df5e44f..5b3bfa96 100644 --- a/tatsu/buffering.py +++ b/tatsu/buffering.py @@ -13,7 +13,14 @@ from typing import Any from .exceptions import ParseError -from .infos import CommentInfo, LineIndexInfo, LineInfo, ParserConfig, PosLine +from .infos import ( + CommentInfo, + LineIndexInfo, + LineInfo, + ParserConfig, + PosLine, + UndefinedStr, +) from .tokenizing import Tokenizer from .util import ( RETYPE, @@ -74,15 +81,20 @@ def whitespace(self): @staticmethod def build_whitespace_re(whitespace): - if whitespace is None: + if type(whitespace) is UndefinedStr: return WHITESPACE_RE + if whitespace in {None, ''}: + return None elif isinstance(whitespace, RETYPE): return whitespace elif whitespace: if not isinstance(whitespace, str): + # FIXME: + # this feature is undocumented + # only regular expressions should be supported # a list or a set? - whitespace = ''.join(c for c in whitespace) - return re.compile(f'(?m)[{re.escape(whitespace)}]+') + whitespace = f"[{''.join(c for c in whitespace)}]+" + return re.compile(f'(?m){whitespace}') else: return None diff --git a/tatsu/codegen/python.py b/tatsu/codegen/python.py index 2414e756..a8dfc583 100755 --- a/tatsu/codegen/python.py +++ b/tatsu/codegen/python.py @@ -509,7 +509,7 @@ def {name}(self, ast): # Any changes you make to it will be overwritten the next time # the file is generated. - # ruff: noqa: I001, SIM117 + # ruff: noqa: I001, F401, SIM117 import sys from pathlib import Path @@ -517,9 +517,9 @@ def {name}(self, ast): from tatsu.buffering import Buffer from tatsu.parsing import Parser from tatsu.parsing import tatsumasu - from tatsu.parsing import leftrec, nomemo, isname # noqa: F401 + from tatsu.parsing import leftrec, nomemo, isname from tatsu.infos import ParserConfig - from tatsu.util import re, generic_main # noqa: F401 + from tatsu.util import re, generic_main KEYWORDS: set[str] = set({keywords}) diff --git a/tatsu/infos.py b/tatsu/infos.py index efcbc66b..f6c29932 100644 --- a/tatsu/infos.py +++ b/tatsu/infos.py @@ -11,6 +11,13 @@ from .util.unicode_characters import C_DERIVE +class UndefinedStr(str): + pass + + +_undefined_str = UndefinedStr() + + @dataclasses.dataclass class ParserConfig: owner: Any = None @@ -48,7 +55,7 @@ class ParserConfig: ignorecase: bool | None = False namechars: str = '' nameguard: bool | None = None # implied by namechars - whitespace: str | None = None + whitespace: str | None = _undefined_str parseinfo: bool = False diff --git a/tatsu/parser_semantics.py b/tatsu/parser_semantics.py index 946b75da..a5ebba90 100644 --- a/tatsu/parser_semantics.py +++ b/tatsu/parser_semantics.py @@ -136,6 +136,7 @@ def grammar(self, ast, *args): keywords = list(flatten(ast.keywords)) or [] if directives.get('whitespace') in {'None', 'False'}: + # NOTE: use '' because None will _not_ override defaults in configuration directives['whitespace'] = '' name = ( diff --git a/tatsu/util/_common.py b/tatsu/util/_common.py index 510cd6bd..cf0b8a5e 100644 --- a/tatsu/util/_common.py +++ b/tatsu/util/_common.py @@ -27,7 +27,7 @@ logger.addHandler(ch) -WHITESPACE_RE = re.compile(r'(?s)\s+') +WHITESPACE_RE = re.compile(r'(?m)\s+') RETYPE = type(re.compile('.')) diff --git a/test/grammar/directive_test.py b/test/grammar/directive_test.py index a98f485f..024562c9 100644 --- a/test/grammar/directive_test.py +++ b/test/grammar/directive_test.py @@ -44,6 +44,13 @@ def test_whitespace_none_directive(self): else: self.fail('parsed through non-whitespace') + def test_default_whitespace(self): + grammar = r""" + start = {'x'}+ $; + """ + + tatsu.parse(grammar, "x x x") + def test_eol_comments_re_directive(self): grammar = """ @@eol_comments :: /#.*?$/