From 540f2d32425def88d2f0d4310cb98db7f6cec415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Laurent=20Th=C3=A9venoux?= Date: Wed, 20 Apr 2022 15:15:39 +0200 Subject: [PATCH] langkit/parsers.py: support Cut in Opt subparser Improve error recovery of incomplete code parsing by allowing Cut parser in Opt ones. TN: S201-022 --- langkit/parsers.py | 70 +++- langkit/templates/parsers/opt_code_ada.mako | 83 ++++- langkit/templates/parsers/row_code_ada.mako | 6 + .../expected_concrete_syntax.lkt | 47 +++ testsuite/tests/grammar/multiple_cuts/main.py | 50 +++ .../tests/grammar/multiple_cuts/test.out | 337 ++++++++++++++++++ testsuite/tests/grammar/multiple_cuts/test.py | 43 +++ .../tests/grammar/multiple_cuts/test.yaml | 1 + 8 files changed, 620 insertions(+), 17 deletions(-) create mode 100644 testsuite/tests/grammar/multiple_cuts/expected_concrete_syntax.lkt create mode 100644 testsuite/tests/grammar/multiple_cuts/main.py create mode 100644 testsuite/tests/grammar/multiple_cuts/test.out create mode 100644 testsuite/tests/grammar/multiple_cuts/test.py create mode 100644 testsuite/tests/grammar/multiple_cuts/test.yaml diff --git a/langkit/parsers.py b/langkit/parsers.py index b9ab3d189..44cc27f4d 100644 --- a/langkit/parsers.py +++ b/langkit/parsers.py @@ -615,6 +615,8 @@ def traverse_nobacktrack( variable if necessary, indicating which parsers should not backtrack. """ if isinstance(self, Cut): + # Do not create a new variable for consecutive Cuts + if nobt: self.no_backtrack = nobt else: @@ -622,11 +624,33 @@ def traverse_nobacktrack( for c in self.children: nobt = c.traverse_nobacktrack(self.no_backtrack) - # Or parsers are a stop point for Cut - if nobt and not isinstance(self, Or): + # Or and Opt parsers are stop points for Cut: + # + # * For Or(A, B, ...) parsers, the effect of a Cut in A/B/... must + # stop when parsing returns from A/B/..., so we do not want the + # no_backtrack variable to be propagated from A to B, etc. and + # from A/B/... to the Or parser itself. + # + # * For Parser(A, Opt(B), Opt(C), ...) parsers, the effect of a Cut + # in A/... must stop when parsing B/C, so we do not want the + # no_backtrack variable to be propagated from A/... to B/C. On + # the other hand, a Cut in B should be propagated to the Parser + # itself, which includes A/... parsers, but not C (i.e. the + # effect of a Cut in B or C must not affect C or B, respectively, + # but only their parent parser Parser). + + if nobt and not isinstance(self, Or) and not isinstance(c, Opt): self.no_backtrack = nobt + # If c is an Opt parser that contains a Cut, the no_backtrack value + # of c will be propagated to self: create a no_backtrack variable + # in self to hold the propagated value if no Cut has been defined + # at this point in self yet. + + if nobt and not self.no_backtrack and isinstance(c, Opt): + self.no_backtrack = VarDef('nobt', T.Bool, reinit=True) + return self.no_backtrack def create_vars_after(self, start_pos: VarDef) -> None: @@ -2385,6 +2409,48 @@ class Cut(Parser): function Foo is -- This function decl will be parsed correctly print("lol") end + + Still in the perspective of better error recovery, a ``Cut`` parser is also + allowed in an ``Opt`` parser in order to prevent backtracking even when an + ``Opt`` parser fails. Here is an example of how to use the ``Cut`` parser + in an ``Opt`` one:: + + body=Body(Opt("scope", identifier), "begin", stmts_list, "end") + + In this case, if we try to parse the input ``"scope begin [stmts] end"``, + it will fail because of the missing ``identifier`` field, the ``Opt`` + parser will backtrack and the ``scope`` keyword will report an error. + Nevertheless, it can be improved thanks to a ``Cut``:: + + body=Body(Opt("scope", Cut(), identifier), "begin", stmts_list, "end") + + Now, the parser will not backtrack and produce an incomplete node, taking + into account the ``Opt`` part. The error will now concern the + ``identifier`` field being absent instead of complaining about the + ``scope`` keyword. This also means that on the simple input: ``"scope"``, + the parser won't backtrack and produce an incomplete ``Body`` node. + + Note that the ``Cut`` parser only applies to the ``Opt`` parser it is + defined in, therefore, the parser will backtrack on the following input: + ``"begin end"``. Here, the parser will fail because of the missing + ``stmts_list`` field. Several ``Cut`` parsers can be used to improve error + recovery in that case. Rewriting the rule as:: + + body=Body(Opt("scope", Cut(), identifier), + "begin", Cut(), stmts_list, "end") + + will allow the parser to properly parse the incomplete input, reporting the + missing ``stmts_list`` field. Moreover, if no ``Cut`` is defined in the + ``Opt`` parser:: + + body=Body(Opt("scope", identifier), "begin", Cut(), stmts_list, "end") + + The ``Cut`` in the ``Body`` parser has no effect in the ``Opt`` part, which + means that the following input: ``"scope begin end"``, will produce a + parsing error and won't recover anything from the ``Opt`` parser: the + ``identifier`` being absent, the ``Opt`` parser will fail and backtrack, + the ``scope`` keyword will be reported as en error, and, the ``begin end`` + will be incompletely parsed (no backtrack because of the ``Cut``). """ def discard(self) -> bool: diff --git a/langkit/templates/parsers/opt_code_ada.mako b/langkit/templates/parsers/opt_code_ada.mako index 9076d81bf..4921c1be8 100644 --- a/langkit/templates/parsers/opt_code_ada.mako +++ b/langkit/templates/parsers/opt_code_ada.mako @@ -12,11 +12,49 @@ if parser._booleanize: alt_true, alt_false = base._alternatives %> +<%def name="no_backtrack_failure()"> + ## Code to execute for error recovery inside the opt parser: set parser + ## position to the last failure position and emit a diagnostic. + if ${parser.no_backtrack} then + ${subparser.pos_var} := Parser.Last_Fail.Pos; + + Append (Parser.Diagnostics, + Sloc_Range (Parser.TDH.all, + Get_Token (Parser.TDH.all, ${subparser.pos_var})), + To_Text ("Cannot parse <${parser.name}>")); + + Add_Last_Fail_Diagnostic (Parser); + end if; + + +<%def name="init_empty_list()"> + ${subparser.res_var} := + ${parser_type.parser_allocator} (Parser.Mem_Pool); + Initialize + (Self => ${parser.res_var}, + Kind => ${parser_type.ada_kind_name}, + Unit => Parser.Unit, + Token_Start_Index => ${parser.start_pos} - 1, + Token_End_Index => No_Token_Index); + Initialize_List + (Self => ${subparser.res_var}, + Parser => Parser, + Count => 0); + + +<%def name="discard_res_var()"> + ${subparser.res_var} := ${parser_type.storage_nullexpr}; + + +<%def name="reset_pos_var()"> + ${subparser.pos_var} := ${parser.start_pos}; + + ${subparser.generate_code()} if ${subparser.pos_var} = No_Token_Index then ## The subparser failed to match the input: produce result for the empty - ## sequence. + ## or incomplete sequence. % if parser._booleanize: % if base.is_bool_type: @@ -31,20 +69,29 @@ if ${subparser.pos_var} = No_Token_Index then Token_End_Index => No_Token_Index); % endif % elif parser_type and parser_type.is_list_type: - ${subparser.res_var} := - ${parser_type.parser_allocator} (Parser.Mem_Pool); - Initialize - (Self => ${parser.res_var}, - Kind => ${parser_type.ada_kind_name}, - Unit => Parser.Unit, - Token_Start_Index => ${parser.start_pos} - 1, - Token_End_Index => No_Token_Index); - Initialize_List - (Self => ${subparser.res_var}, - Parser => Parser, - Count => 0); + % if parser.no_backtrack: + ${no_backtrack_failure()} + + ## Init an empty list if the subparser failed + if ${subparser.res_var} = ${parser_type.storage_nullexpr} then + ${init_empty_list()} + end if; + % else: + ## Backtrack case: discard subparser result (init an empty list) + ${init_empty_list()} + % endif % elif parser_type: - ${subparser.res_var} := ${parser_type.storage_nullexpr}; + % if parser.no_backtrack: + ${no_backtrack_failure()} + + ## Backtrack case: discard subparser result + if not ${parser.no_backtrack} then + ${discard_res_var()} + end if; + % else: + ## Backtrack case: discard subparser result + ${discard_res_var()} + % endif % endif % if parser._is_error: @@ -56,7 +103,13 @@ if ${subparser.pos_var} = No_Token_Index then To_Text ("Missing '${subparser.error_repr}'")); % endif - ${subparser.pos_var} := ${parser.start_pos}; + % if parser.no_backtrack: + if not ${parser.no_backtrack} then + ${reset_pos_var()} + end if; + % else: + ${reset_pos_var()} + % endif % if parser._booleanize: else diff --git a/langkit/templates/parsers/row_code_ada.mako b/langkit/templates/parsers/row_code_ada.mako index a9e3c43ce..ad1aaf71f 100644 --- a/langkit/templates/parsers/row_code_ada.mako +++ b/langkit/templates/parsers/row_code_ada.mako @@ -11,6 +11,12 @@ ${parser.pos_var} := ${parser.start_pos}; ## Parse the element ${subparser.generate_code()} +## Propagate no_backtrack information. If a subparser sets its no_backtrack +## variable, it should propagate the result to its parent. +% if subparser.no_backtrack and parser.no_backtrack: + ${parser.no_backtrack} := ${subparser.no_backtrack}; +% endif + % if parser.progress_var: ${parser.progress_var} := ${num}; % endif diff --git a/testsuite/tests/grammar/multiple_cuts/expected_concrete_syntax.lkt b/testsuite/tests/grammar/multiple_cuts/expected_concrete_syntax.lkt new file mode 100644 index 000000000..28bf50043 --- /dev/null +++ b/testsuite/tests/grammar/multiple_cuts/expected_concrete_syntax.lkt @@ -0,0 +1,47 @@ +import lexer_example + +@with_lexer(foo_lexer) +grammar foo_grammar { + @main_rule stmt_rule <- list*(or(def | var | dot | comma)) + id <- Id(@identifier) + def <- Def( + "def" + / id ?pick("(" / id ")") ?pick("{" / id "}") + ) + var <- Var( + "var" / id ?pick("(" / list+(id, ",") ")") + ) + dot <- Dot( + "." id ?pick("(" / id ")") ?pick("{" / id "}") + ) + comma <- Comma(?pick("(" / id ")") "," id id) +} + +@abstract class FooNode implements Node[FooNode] { +} + +class Comma : FooNode { + @parse_field id1: Id + @parse_field id2: Id + @parse_field id3: Id +} + +class Def : FooNode { + @parse_field id1: Id + @parse_field id2: Id + @parse_field id3: Id +} + +class Dot : FooNode { + @parse_field id1: Id + @parse_field id2: Id + @parse_field id3: Id +} + +class Id : FooNode implements TokenNode { +} + +class Var : FooNode { + @parse_field id: Id + @parse_field ids: ASTList[FooNode, Id] +} diff --git a/testsuite/tests/grammar/multiple_cuts/main.py b/testsuite/tests/grammar/multiple_cuts/main.py new file mode 100644 index 000000000..085298819 --- /dev/null +++ b/testsuite/tests/grammar/multiple_cuts/main.py @@ -0,0 +1,50 @@ +import libfoolang + + +inputs = [ + ('complete case 1', "def a"), + ('complete case 2', "def a (b)"), + ('complete case 3', "def a (b) {c}"), + ('complete case 4', "var a"), + ('complete case 5', "var a (b)"), + ('complete case 6', "var a (b, c, d)"), + ('complete case 7', ". a (b)"), + ('complete case 8', ". a (b) {c}"), + ('complete case 9', ", a b"), + ('complete case 10', "(a) , b c"), + # The def and var rules check that incomplete results are produced + # regarding the presence of several cut parsers. + ('incomplete case 1', "def"), + ('incomplete case 2', "def a (b"), + ('incomplete case 3', "def a (b) {c"), + ('incomplete case 4', "def a ("), + ('incomplete case 5', "def a (b) {"), + ('incomplete case 6', "def a ( {"), + ('incomplete case 7', "def a (b {c"), + ('incomplete case 8', "var"), + ('incomplete case 9', "var a ("), + ('incomplete case 10', "var a ()"), + ('incomplete case 11', "var a (b, c, d"), + # The dot rule checks that an incomplete result is produced if only the + # optional part can set the no_backtracing variable. + ('incomplete case 12', ". a (b"), + ('incomplete case 13', ". a (b) {"), + ('incomplete case 14', ". a ( {"), + # The comma rule is similar to the dot one but the optional part is at the + # beginning of the rule. + ('incomplete case 15', ", b"), + ('incomplete case 16', "(a) , b"), + ('incomplete case 17', "(a , b"), +] + +ctx = libfoolang.AnalysisContext() + +for name, text in inputs: + print(f"=== {name}: {text} ===") + print() + u = ctx.get_from_buffer("buffer", buffer=text) + + for d in u.diagnostics: + print(d) + u.root.dump() + print() diff --git a/testsuite/tests/grammar/multiple_cuts/test.out b/testsuite/tests/grammar/multiple_cuts/test.out new file mode 100644 index 000000000..ca0443bdb --- /dev/null +++ b/testsuite/tests/grammar/multiple_cuts/test.out @@ -0,0 +1,337 @@ +=== complete case 1: def a === + +FooNodeList buffer:1:1-1:6 +|item_0: +| Def buffer:1:1-1:6 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: None +| |id3: None + +=== complete case 2: def a (b) === + +FooNodeList buffer:1:1-1:10 +|item_0: +| Def buffer:1:1-1:10 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: +| | Id buffer:1:8-1:9: b +| |id3: None + +=== complete case 3: def a (b) {c} === + +FooNodeList buffer:1:1-1:14 +|item_0: +| Def buffer:1:1-1:14 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: +| | Id buffer:1:8-1:9: b +| |id3: +| | Id buffer:1:12-1:13: c + +=== complete case 4: var a === + +FooNodeList buffer:1:1-1:6 +|item_0: +| Var buffer:1:1-1:6 +| |id: +| | Id buffer:1:5-1:6: a +| |ids: +| | IdList buffer:1:4-1:4 + +=== complete case 5: var a (b) === + +FooNodeList buffer:1:1-1:10 +|item_0: +| Var buffer:1:1-1:10 +| |id: +| | Id buffer:1:5-1:6: a +| |ids: +| | IdList buffer:1:8-1:9 +| | |item_0: +| | | Id buffer:1:8-1:9: b + +=== complete case 6: var a (b, c, d) === + +FooNodeList buffer:1:1-1:16 +|item_0: +| Var buffer:1:1-1:16 +| |id: +| | Id buffer:1:5-1:6: a +| |ids: +| | IdList buffer:1:8-1:15 +| | |item_0: +| | | Id buffer:1:8-1:9: b +| | |item_1: +| | | Id buffer:1:11-1:12: c +| | |item_2: +| | | Id buffer:1:14-1:15: d + +=== complete case 7: . a (b) === + +FooNodeList buffer:1:1-1:8 +|item_0: +| Dot buffer:1:1-1:8 +| |id1: +| | Id buffer:1:3-1:4: a +| |id2: +| | Id buffer:1:6-1:7: b +| |id3: None + +=== complete case 8: . a (b) {c} === + +FooNodeList buffer:1:1-1:12 +|item_0: +| Dot buffer:1:1-1:12 +| |id1: +| | Id buffer:1:3-1:4: a +| |id2: +| | Id buffer:1:6-1:7: b +| |id3: +| | Id buffer:1:10-1:11: c + +=== complete case 9: , a b === + +FooNodeList buffer:1:1-1:6 +|item_0: +| Comma buffer:1:1-1:6 +| |id1: None +| |id2: +| | Id buffer:1:3-1:4: a +| |id3: +| | Id buffer:1:5-1:6: b + +=== complete case 10: (a) , b c === + +FooNodeList buffer:1:1-1:10 +|item_0: +| Comma buffer:1:1-1:10 +| |id1: +| | Id buffer:1:2-1:3: a +| |id2: +| | Id buffer:1:7-1:8: b +| |id3: +| | Id buffer:1:9-1:10: c + +=== incomplete case 1: def === + +1:1-1:4: Cannot parse +1:4-1:4: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:4 +|item_0: +| Def buffer:1:1-1:4 +| |id1: None +| |id2: None +| |id3: None + +=== incomplete case 2: def a (b === + +1:9-1:9: Cannot parse +1:9-1:9: Expected ')', got Termination +FooNodeList buffer:1:1-1:9 +|item_0: +| Def buffer:1:1-1:9 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: +| | Id buffer:1:8-1:9: b +| |id3: None + +=== incomplete case 3: def a (b) {c === + +1:13-1:13: Cannot parse +1:13-1:13: Expected '}', got Termination +FooNodeList buffer:1:1-1:13 +|item_0: +| Def buffer:1:1-1:13 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: +| | Id buffer:1:8-1:9: b +| |id3: +| | Id buffer:1:12-1:13: c + +=== incomplete case 4: def a ( === + +1:8-1:8: Cannot parse +1:8-1:8: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:8 +|item_0: +| Def buffer:1:1-1:8 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: None +| |id3: None + +=== incomplete case 5: def a (b) { === + +1:12-1:12: Cannot parse +1:12-1:12: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:12 +|item_0: +| Def buffer:1:1-1:12 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: +| | Id buffer:1:8-1:9: b +| |id3: None + +=== incomplete case 6: def a ( { === + +1:9-1:10: Cannot parse +1:9-1:10: Expected Identifier, got '{' +1:10-1:10: Cannot parse +1:10-1:10: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:10 +|item_0: +| Def buffer:1:1-1:10 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: None +| |id3: None + +=== incomplete case 7: def a (b {c === + +1:10-1:11: Cannot parse +1:10-1:11: Expected ')', got '{' +1:12-1:12: Cannot parse +1:12-1:12: Expected '}', got Termination +FooNodeList buffer:1:1-1:12 +|item_0: +| Def buffer:1:1-1:12 +| |id1: +| | Id buffer:1:5-1:6: a +| |id2: +| | Id buffer:1:8-1:9: b +| |id3: +| | Id buffer:1:11-1:12: c + +=== incomplete case 8: var === + +1:1-1:4: Cannot parse +1:4-1:4: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:4 +|item_0: +| Var buffer:1:1-1:4 +| |id: None +| |ids: None + +=== incomplete case 9: var a ( === + +1:8-1:8: Cannot parse +1:8-1:8: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:8 +|item_0: +| Var buffer:1:1-1:8 +| |id: +| | Id buffer:1:5-1:6: a +| |ids: +| | IdList buffer:1:8-1:8 + +=== incomplete case 10: var a () === + +1:8-1:9: Cannot parse +1:8-1:9: Expected Identifier, got ')' +1:8-1:9: End of input expected, got "R_Par" +FooNodeList buffer:1:1-1:8 +|item_0: +| Var buffer:1:1-1:8 +| |id: +| | Id buffer:1:5-1:6: a +| |ids: +| | IdList buffer:1:8-1:8 + +=== incomplete case 11: var a (b, c, d === + +1:15-1:15: Cannot parse +1:15-1:15: Expected ')', got Termination +FooNodeList buffer:1:1-1:15 +|item_0: +| Var buffer:1:1-1:15 +| |id: +| | Id buffer:1:5-1:6: a +| |ids: +| | IdList buffer:1:8-1:15 +| | |item_0: +| | | Id buffer:1:8-1:9: b +| | |item_1: +| | | Id buffer:1:11-1:12: c +| | |item_2: +| | | Id buffer:1:14-1:15: d + +=== incomplete case 12: . a (b === + +1:7-1:7: Cannot parse +1:7-1:7: Expected ')', got Termination +FooNodeList buffer:1:1-1:7 +|item_0: +| Dot buffer:1:1-1:7 +| |id1: +| | Id buffer:1:3-1:4: a +| |id2: +| | Id buffer:1:6-1:7: b +| |id3: None + +=== incomplete case 13: . a (b) { === + +1:10-1:10: Cannot parse +1:10-1:10: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:10 +|item_0: +| Dot buffer:1:1-1:10 +| |id1: +| | Id buffer:1:3-1:4: a +| |id2: +| | Id buffer:1:6-1:7: b +| |id3: None + +=== incomplete case 14: . a ( { === + +1:7-1:8: Cannot parse +1:7-1:8: Expected Identifier, got '{' +1:8-1:8: Cannot parse +1:8-1:8: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:8 +|item_0: +| Dot buffer:1:1-1:8 +| |id1: +| | Id buffer:1:3-1:4: a +| |id2: None +| |id3: None + +=== incomplete case 15: , b === + +1:4-1:4: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:1 + +=== incomplete case 16: (a) , b === + +1:1-1:2: Cannot parse +1:8-1:8: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:8 +|item_0: +| Comma buffer:1:1-1:8 +| |id1: +| | Id buffer:1:2-1:3: a +| |id2: +| | Id buffer:1:7-1:8: b +| |id3: None + +=== incomplete case 17: (a , b === + +1:4-1:5: Cannot parse +1:4-1:5: Expected ')', got ',' +1:1-1:2: Cannot parse +1:7-1:7: Expected Identifier, got Termination +FooNodeList buffer:1:1-1:7 +|item_0: +| Comma buffer:1:1-1:7 +| |id1: +| | Id buffer:1:2-1:3: a +| |id2: +| | Id buffer:1:6-1:7: b +| |id3: None + +Done diff --git a/testsuite/tests/grammar/multiple_cuts/test.py b/testsuite/tests/grammar/multiple_cuts/test.py new file mode 100644 index 000000000..0047bdff7 --- /dev/null +++ b/testsuite/tests/grammar/multiple_cuts/test.py @@ -0,0 +1,43 @@ +""" +Test that a rule with multiple cut parsers works correctly. +""" + +from langkit.dsl import ASTNode, Field + +from utils import build_and_run + + +class FooNode(ASTNode): + pass + + +class Def(FooNode): + id1 = Field() + id2 = Field() + id3 = Field() + + +class Dot(FooNode): + id1 = Field() + id2 = Field() + id3 = Field() + + +class Comma(FooNode): + id1 = Field() + id2 = Field() + id3 = Field() + + +class Id(FooNode): + token_node = True + + +class Var(FooNode): + id = Field() + ids = Field() + + +build_and_run(lkt_file='expected_concrete_syntax.lkt', + py_script='main.py') +print('Done') diff --git a/testsuite/tests/grammar/multiple_cuts/test.yaml b/testsuite/tests/grammar/multiple_cuts/test.yaml new file mode 100644 index 000000000..30423a038 --- /dev/null +++ b/testsuite/tests/grammar/multiple_cuts/test.yaml @@ -0,0 +1 @@ +driver: python