remove defaults for regular expressions for comments (#314)
fixes #312

* [codegen] render REs for comments into the tokenizer/buffer and parser
* [codegen] do not render a default tokenizer/buffer
* [config] do not set defaults for comment REs
* [parser] upgrade to the ParserConfig protocol
* [test] add unit test for no default comment regexes
apalala authored Oct 13, 2023
1 parent 0a34786 commit ca3f7e6
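A note on the downstream effect (the example below is a hand-written sketch, not code from this commit; the grammar and input are made up): with the defaults gone, a grammar that wants comments skipped must declare the patterns itself, either with the @@comments / @@eol_comments directives or by passing comments_re / eol_comments_re as settings.

import tatsu

grammar = r'''
    @@eol_comments :: /#[^\n]*$/

    start = 'a' $ ;
'''

text = '''
    # with the directive declared, this comment line is skipped
    a
'''

ast = tatsu.parse(grammar, text)  # without the directive this now raises FailedToken
print(ast)

Dropping the directive reproduces the FailedToken behaviour exercised by the new unit test in test/grammar/syntax_test.py below.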
Showing 13 changed files with 92 additions and 103 deletions.
2 changes: 1 addition & 1 deletion mypy.ini
@@ -1,4 +1,4 @@
[mypy]
python_version = 3.9
python_version = 3.12
ignore_missing_imports = True
exclude = parsers|docs|build|tmp
19 changes: 2 additions & 17 deletions tatsu/bootstrap.py
@@ -9,11 +9,8 @@
# Any changes you make to it will be overwritten the next time
# the file is generated.

from __future__ import annotations

import sys

from tatsu.buffering import Buffer
from tatsu.parsing import Parser
from tatsu.parsing import tatsumasu
from tatsu.parsing import leftrec, nomemo, isname # noqa
@@ -26,20 +23,6 @@
} # type: ignore


class EBNFBootstrapBuffer(Buffer):
def __init__(self, text, /, config: ParserConfig | None = None, **settings):
config = ParserConfig.new(
config,
owner=self,
nameguard=None,
ignorecase=False,
namechars='',
parseinfo=True,
)
config = config.replace(**settings)
super().__init__(text, config=config)


class EBNFBootstrapParser(Parser):
def __init__(self, /, config: ParserConfig | None = None, **settings):
config = ParserConfig.new(
@@ -49,6 +32,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
ignorecase=False,
namechars='',
parseinfo=True,
comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]',
eol_comments_re='#[^\\n]*$',
keywords=KEYWORDS,
start='start',
)
23 changes: 6 additions & 17 deletions tatsu/codegen/python.py
@@ -471,6 +471,8 @@ def render_fields(self, fields):
left_recursion = self.node.config.left_recursion
parseinfo = self.node.config.parseinfo
namechars = repr(self.node.config.namechars or '')
comments_re = repr(self.node.config.comments_re)
eol_comments_re = repr(self.node.config.eol_comments_re)

rules = '\n'.join([
self.get_renderer(rule).render() for rule in self.node.rules
@@ -494,6 +496,8 @@ def render_fields(self, fields):
parseinfo=parseinfo,
keywords=keywords,
namechars=namechars,
comments_re=comments_re,
eol_comments_re=eol_comments_re,
)

abstract_rule_template = '''
@@ -513,11 +517,8 @@ def {name}(self, ast): # noqa
# Any changes you make to it will be overwritten the next time
# the file is generated.
from __future__ import annotations
import sys
from tatsu.buffering import Buffer
from tatsu.parsing import Parser
from tatsu.parsing import tatsumasu
from tatsu.parsing import leftrec, nomemo, isname # noqa
@@ -528,20 +529,6 @@ def {name}(self, ast): # noqa
KEYWORDS = {{{keywords}}} # type: ignore
class {name}Buffer(Buffer):
def __init__(self, text, /, config: ParserConfig | None = None, **settings):
config = ParserConfig.new(
config,
owner=self,
nameguard={nameguard},
ignorecase={ignorecase},
namechars={namechars},
parseinfo={parseinfo},
)
config = config.replace(**settings)
super().__init__(text, config=config)
class {name}Parser(Parser):
def __init__(self, /, config: ParserConfig | None = None, **settings):
config = ParserConfig.new(
@@ -551,6 +538,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
ignorecase={ignorecase},
namechars={namechars},
parseinfo={parseinfo},
comments_re={comments_re},
eol_comments_re={eol_comments_re},
keywords=KEYWORDS,
start={start!r},
)
37 changes: 0 additions & 37 deletions tatsu/grammars.py
@@ -15,7 +15,6 @@
from .ast import AST
from .contexts import ParseContext
from .objectmodel import Node
from .bootstrap import EBNFBootstrapBuffer
from .infos import RuleInfo, ParserConfig
from .leftrec import Nullable, find_left_recursion
from .collections import OrderedSet as oset
@@ -49,42 +48,6 @@ def pythonize_name(name):
return ''.join('_' + c.lower() if c.isupper() else c for c in name)


class EBNFBuffer(EBNFBootstrapBuffer):
def __init__(
self, text, filename=None, comments_re=None, eol_comments_re=None, **kwargs):
super().__init__(
text,
filename=filename,
memoize_lookaheads=False,
comment_recovery=True,
comments_re=comments_re,
eol_comments_re=eol_comments_re,
**kwargs
)

def process_block(self, name, lines, index, **kwargs):
i = 0
while i < len(lines):
line = lines[i]
if re.match(PRAGMA_RE, line):
directive, arg = line.split('#', 1)[1], ''
if '::' in directive:
directive, arg = directive.split('::', 1)
directive, arg = directive.strip(), arg.strip()
i = self.pragma(name, directive, arg, lines, index, i)
else:
i += 1
return lines, index

def pragma(self, source, name, arg, lines, index, i):
# we only recognize the 'include' pragama
if name == 'include':
filename = arg.strip('\'"')
return self.include_file(source, filename, lines, index, i, i + 1)
else:
return i + 1 # will be treated as a directive by the parser


class ModelContext(ParseContext):
def __init__(self, rules, /, start=None, config: ParserConfig|None = None, **settings):
config = ParserConfig.new(config, **settings)
8 changes: 2 additions & 6 deletions tatsu/infos.py
@@ -15,10 +15,6 @@
from .tokenizing import Tokenizer


COMMENTS_RE = r'\(\*((?:.|\n)*?)\*\)'
EOL_COMMENTS_RE = r'#([^\n]*?)$'


@dataclasses.dataclass
class ParserConfig:
owner: Any = None
@@ -30,8 +26,8 @@ class ParserConfig:
start_rule: str|None = None # FIXME
rule_name: str|None = None # Backward compatibility

comments_re: str|None = COMMENTS_RE
eol_comments_re: str|None = EOL_COMMENTS_RE
comments_re: str|None = None
eol_comments_re: str|None = None

tokenizercls: Type[Tokenizer]|None = None # FIXME
semantics: Type|None = None
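With the module-level COMMENTS_RE and EOL_COMMENTS_RE constants removed, a ParserConfig built without explicit settings now carries no comment patterns at all. A small sketch of the resulting behaviour (hand-written, using only the ParserConfig.new() / replace() calls visible in this diff):

from tatsu.infos import ParserConfig

config = ParserConfig.new(None)
assert config.comments_re is None
assert config.eol_comments_re is None

# opting back in to what the removed defaults used to provide
config = config.replace(
    comments_re=r'\(\*((?:.|\n)*?)\*\)',
    eol_comments_re=r'#([^\n]*?)$',
)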
66 changes: 54 additions & 12 deletions tatsu/parser.py
@@ -1,25 +1,67 @@
from __future__ import annotations
import re
from typing import Any

from tatsu.bootstrap import EBNFBootstrapParser
from tatsu.semantics import ASTSemantics
from tatsu.parser_semantics import EBNFGrammarSemantics
from tatsu.grammars import EBNFBuffer
from .infos import ParserConfig
from .buffering import Buffer
from .grammars import PRAGMA_RE
from .semantics import ASTSemantics
from .parser_semantics import EBNFGrammarSemantics
from .bootstrap import EBNFBootstrapParser


class EBNFBuffer(Buffer):
def __init__(self, text, /, filename=None, config: ParserConfig|None = None, **settings: Any):
config = ParserConfig.new(
config=config,
owner=self,
filename=filename,
**settings)
super().__init__(text, config=config)

def process_block(self, name, lines, index, **kwargs):
i = 0
while i < len(lines):
line = lines[i]
if re.match(PRAGMA_RE, line):
directive, arg = line.split('#', 1)[1], ''
if '::' in directive:
directive, arg = directive.split('::', 1)
directive, arg = directive.strip(), arg.strip()
i = self.pragma(name, directive, arg, lines, index, i)
else:
i += 1
return lines, index

def pragma(self, source, name, arg, lines, index, i):
# we only recognize the 'include' pragama
if name == 'include':
filename = arg.strip('\'"')
return self.include_file(source, filename, lines, index, i, i + 1)
else:
return i + 1 # will be treated as a directive by the parser


class EBNFParser(EBNFBootstrapParser):
def __init__(self, semantics=None, **kwargs):
def __init__(self, name: str | None = None, config: ParserConfig|None = None, semantics=None, **settings: Any):
if semantics is None:
semantics = ASTSemantics()
super().__init__(semantics=semantics, **kwargs)
config = ParserConfig.new(
config=config,
name=name,
semantics=semantics,
**settings)
super().__init__(config)


class GrammarGenerator(EBNFBootstrapParser):
def __init__(self, grammar_name=None, semantics=None, parseinfo=True, **kwargs):
def __init__(self, name: str | None = None, config: ParserConfig|None = None, semantics=None, **settings: Any):
if semantics is None:
semantics = EBNFGrammarSemantics(grammar_name)
super().__init__(
semantics = EBNFGrammarSemantics(name)
config = ParserConfig.new(
config=config,
name=name,
semantics=semantics,
parseinfo=parseinfo,
tokenizercls=EBNFBuffer,
**kwargs
**settings,
)
super().__init__(config)
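Both entry points now fold their arguments into a ParserConfig instead of forwarding loose keyword arguments. A usage sketch (hypothetical caller code, not from the commit; the name 'MyGrammar' and the one-line grammar are made up):

from tatsu.parser import GrammarGenerator

generator = GrammarGenerator('MyGrammar', parseinfo=True)  # settings are folded into ParserConfig.new()
model = generator.parse(r'start = /\d+/ $ ;')              # returns the grammar model built by EBNFGrammarSemantics
print(model.parse('42'))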
2 changes: 1 addition & 1 deletion test/grammar/alerts_test.py
@@ -5,7 +5,7 @@

def test_alert_interpolation():
input = '42 69'
grammar = '''
grammar = r'''
start = a:number b: number i:^`"seen: {a}, {b}"` $ ;
number = /\d+/ ;
'''
6 changes: 3 additions & 3 deletions test/grammar/constants_test.py
@@ -5,7 +5,7 @@

def test_constant_interpolation():
input = '42 69'
grammar = '''
grammar = r'''
start = a:number b: number i:`"seen: {a}, {b}"` $ ;
number = /\d+/ ;
'''
@@ -14,7 +14,7 @@ def test_constant_interpolation():

def test_constant_interpolation_free():
input = '42 69'
grammar = '''
grammar = r'''
start = a:number b: number i:`seen: {a}, {b}` $ ;
number = /\d+/ ;
'''
@@ -23,7 +23,7 @@ def test_constant_interpolation_free():

def test_constant_interpolation_multiline():
input = '42 69'
grammar = '''
grammar = r'''
start = a:number b: number
i:```
seen:
4 changes: 2 additions & 2 deletions test/grammar/defines_test.py
@@ -6,7 +6,7 @@


def test_name_in_option():
grammar = '''
grammar = r'''
start = expr_range ;
expr_range =
@@ -41,7 +41,7 @@ def test_name_in_option():


def test_by_option():
grammar = '''
grammar = r'''
start = expr_range ;
expr_range =
2 changes: 1 addition & 1 deletion test/grammar/keyword_test.py
@@ -127,7 +127,7 @@ def test_sparse_keywords(self):
self.assertTrue('"%s" is a reserved word' % k in str(e))

def test_ignorecase_keywords(self):
grammar = '''
grammar = r'''
@@ignorecase :: True
@@keyword :: if
18 changes: 16 additions & 2 deletions test/grammar/syntax_test.py
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-
import unittest
import pytest

from tatsu.exceptions import FailedParse
from tatsu.exceptions import FailedParse, FailedToken
from tatsu.tool import compile
from tatsu import tool
from tatsu.util import trim
from tatsu.codegen import codegen
from tatsu.grammars import EBNFBuffer
from tatsu.parser import EBNFBuffer


class SyntaxTests(unittest.TestCase):
@@ -363,3 +364,16 @@ def test_parse_void():
ast = tool.parse(grammar, '')
print(ast)
assert ast is None


def test_no_default_comments():
grammar = '''
start = 'a' $;
'''

text = '''
# no comments are valid
a
'''
with pytest.raises(FailedToken):
tool.parse(grammar, text)
6 changes: 3 additions & 3 deletions test/parser_equivalence_test.py
@@ -10,7 +10,7 @@
"""
OUTPUT = {'number_of_dice': '1', 'sides': '3'}

GRAMMAR = """
GRAMMAR = r"""
start = expression $;
int = /-?\d+/ ;
@@ -82,7 +82,7 @@ def test_error_messages():

# @pytest.mark.skip('work in progress')
def test_name_checked():
grammar = '''
grammar = r'''
@@grammar :: Test
@@ignorecase :: True
@@keyword :: if
@@ -123,7 +123,7 @@ def test_first_rule():


def test_dynamic_compiled_ast():
grammar = '''
grammar = r'''
test::Test = 'TEST' ['A' a:number] ['B' b:number] ;
number::int = /\d+/ ;
'''
2 changes: 1 addition & 1 deletion test/parsing_test.py
@@ -5,7 +5,7 @@

import tatsu
from tatsu.util import trim, eval_escapes, asjson
from tatsu.grammars import EBNFBuffer
from tatsu.parser import EBNFBuffer


class MockIncludeBuffer(EBNFBuffer):
