diff --git a/README.md b/README.md index bec0009..c3e05b4 100644 --- a/README.md +++ b/README.md @@ -26,10 +26,9 @@ As LL(1) grammars operate using `alt` and `seq` primitives, allowing for a match * Transform `a ::= b+` into `a ::= b b*` * Transform `a ::= b*` into `a ::= _empty | (b a)` * Transform `a ::= op1 (op2)` into two rules: - ``` - a ::= op1 _a_1 - _a_1_ ::= op2 - ``` + + a ::= op1 _a_1 + _a_1 ::= op2 Of note in this implementation is that the tokenizer and parser are streaming, so that they can process inputs of arbitrary size. @@ -96,7 +95,7 @@ The {EBNF::Writer} class can be used to write parsed grammars out, either as for The formatted HTML results are designed to be appropriate for including in specifications. ### Parser Errors -On a parsing failure, and exception is raised with information that may be useful in determining the source of the error. +On a parsing failure, an exception is raised with information that may be useful in determining the source of the error. ## EBNF Grammar The [EBNF][] variant used here is based on [W3C](https://w3.org/) [EBNF][] @@ -116,7 +115,7 @@ which can also be proceeded by an optional number enclosed in square brackets to [1] symbol ::= expression -(Note, this can introduce an ambiguity if the previous rule ends in a range or enum and the current rule has no number. In this case, enclosing `expression` within parentheses, or adding intervening comments can resolve the ambiguity.) +(Note, this can introduce an ambiguity if the previous rule ends in a range or enum and the current rule has no number. The parsers dynamically determine the terminal rules for the `LHS` (the identifier, symbol, and `::=`) and `RANGE`). Symbols are written in CAPITAL CASE if they are the start symbol of a regular language (terminals), otherwise with they are treated as non-terminal rules. Literal strings are quoted. 
diff --git a/etc/ebnf.ebnf b/etc/ebnf.ebnf index 320f1d6..1f8dfce 100644 --- a/etc/ebnf.ebnf +++ b/etc/ebnf.ebnf @@ -5,9 +5,8 @@ # Use the LHS terminal to match the identifier, rule name and assignment due to # confusion between the identifier and RANGE. - # Note, for grammars not using identifiers, it is still possible to confuse - # a rule ending with a range the next rule, as it may be interpreted as an identifier. - # In such case, best to enclose the rule in '()'. + # The PEG parser has special rules for matching LHS and RANGE + # so that RANGE is not confused with LHS. [3] rule ::= LHS expression [4] expression ::= alt @@ -40,7 +39,7 @@ [13] HEX ::= '#x' ([a-f] | [A-F] | [0-9])+ - [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' - LHS + [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' [15] O_RANGE ::= '[^' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' diff --git a/etc/ebnf.html b/etc/ebnf.html index 634f4bc..a640c5e 100644 --- a/etc/ebnf.html +++ b/etc/ebnf.html @@ -95,7 +95,7 @@ [14] RANGE ::= - '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? (']' - LHS) + '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? 
']' [15] diff --git a/etc/ebnf.ll1.sxp b/etc/ebnf.ll1.sxp index 239aebd..1912786 100644 --- a/etc/ebnf.ll1.sxp +++ b/etc/ebnf.ll1.sxp @@ -104,10 +104,7 @@ (terminal O_SYMBOL "12a" (plus (alt (range "a-z") (range "A-Z") (range "0-9") '_' '.'))) (terminal HEX "13" (seq '#x' (plus (alt (range "a-f") (range "A-F") (range "0-9"))))) (terminal RANGE "14" - (seq '[' - (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) - (opt '-') - (diff ']' LHS)) ) + (seq '[' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']')) (terminal O_RANGE "15" (seq '[^' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']')) (terminal STRING1 "16" (seq '"' (star (diff CHAR '"')) '"')) diff --git a/etc/ebnf.peg.rb b/etc/ebnf.peg.rb index c319b09..315e2f6 100644 --- a/etc/ebnf.peg.rb +++ b/etc/ebnf.peg.rb @@ -38,13 +38,12 @@ module EBNFMeta EBNF::Rule.new(:_HEX_3, "13.3", [:range, "a-f"], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_4, "13.4", [:range, "A-F"], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_5, "13.5", [:range, "0-9"], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:RANGE, "14", [:seq, "[", :_RANGE_1, :_RANGE_2, :_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_1, "14.1", [:plus, :_RANGE_4], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_4, "14.4", [:alt, :_RANGE_5, :_RANGE_6, :R_CHAR, :HEX], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_5, "14.5", [:seq, :R_CHAR, "-", :R_CHAR], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_6, "14.6", [:seq, :HEX, "-", :HEX], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:RANGE, "14", [:seq, "[", :_RANGE_1, :_RANGE_2, "]"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_RANGE_1, "14.1", [:plus, :_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_RANGE_3, "14.3", [:alt, :_RANGE_4, :_RANGE_5, :R_CHAR, 
:HEX], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_RANGE_4, "14.4", [:seq, :R_CHAR, "-", :R_CHAR], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_RANGE_5, "14.5", [:seq, :HEX, "-", :HEX], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_RANGE_2, "14.2", [:opt, "-"], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_3, "14.3", [:diff, "]", :LHS], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:O_RANGE, "15", [:seq, "[^", :_O_RANGE_1, :_O_RANGE_2, "]"], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_O_RANGE_1, "15.1", [:plus, :_O_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_O_RANGE_3, "15.3", [:alt, :_O_RANGE_4, :_O_RANGE_5, :R_CHAR, :HEX], kind: :terminal).extend(EBNF::PEG::Rule), diff --git a/etc/ebnf.peg.sxp b/etc/ebnf.peg.sxp index 62ad133..6125475 100644 --- a/etc/ebnf.peg.sxp +++ b/etc/ebnf.peg.sxp @@ -35,13 +35,12 @@ (terminal _HEX_3 "13.3" (range "a-f")) (terminal _HEX_4 "13.4" (range "A-F")) (terminal _HEX_5 "13.5" (range "0-9")) - (terminal RANGE "14" (seq '[' _RANGE_1 _RANGE_2 _RANGE_3)) - (terminal _RANGE_1 "14.1" (plus _RANGE_4)) - (terminal _RANGE_4 "14.4" (alt _RANGE_5 _RANGE_6 R_CHAR HEX)) - (terminal _RANGE_5 "14.5" (seq R_CHAR '-' R_CHAR)) - (terminal _RANGE_6 "14.6" (seq HEX '-' HEX)) + (terminal RANGE "14" (seq '[' _RANGE_1 _RANGE_2 ']')) + (terminal _RANGE_1 "14.1" (plus _RANGE_3)) + (terminal _RANGE_3 "14.3" (alt _RANGE_4 _RANGE_5 R_CHAR HEX)) + (terminal _RANGE_4 "14.4" (seq R_CHAR '-' R_CHAR)) + (terminal _RANGE_5 "14.5" (seq HEX '-' HEX)) (terminal _RANGE_2 "14.2" (opt '-')) - (terminal _RANGE_3 "14.3" (diff ']' LHS)) (terminal O_RANGE "15" (seq '[^' _O_RANGE_1 _O_RANGE_2 ']')) (terminal _O_RANGE_1 "15.1" (plus _O_RANGE_3)) (terminal _O_RANGE_3 "15.3" (alt _O_RANGE_4 _O_RANGE_5 R_CHAR HEX)) diff --git a/etc/ebnf.sxp b/etc/ebnf.sxp index 67cca4c..435c333 100644 --- a/etc/ebnf.sxp +++ b/etc/ebnf.sxp @@ -16,10 +16,7 @@ (terminal O_SYMBOL "12a" 
(plus (alt (range "a-z") (range "A-Z") (range "0-9") '_' '.'))) (terminal HEX "13" (seq '#x' (plus (alt (range "a-f") (range "A-F") (range "0-9"))))) (terminal RANGE "14" - (seq '[' - (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) - (opt '-') - (diff ']' LHS)) ) + (seq '[' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']')) (terminal O_RANGE "15" (seq '[^' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']')) (terminal STRING1 "16" (seq '"' (star (diff CHAR '"')) '"')) diff --git a/lib/ebnf/parser.rb b/lib/ebnf/parser.rb index 4013559..0dc4d68 100644 --- a/lib/ebnf/parser.rb +++ b/lib/ebnf/parser.rb @@ -11,6 +11,12 @@ class Parser # @return [Array] attr_reader :ast + # Set on first rule + attr_reader :lhs_includes_identifier + + # Regular expression to match a [...] range, which may be distinguished from an LHS + attr_reader :range + # ## Terminals # Define rules for Terminals, placing results on the input stack, making them available to upstream non-Terminal rules. # @@ -28,7 +34,22 @@ class Parser # # [11] LHS ::= ('[' SYMBOL+ ']' ' '+)? ? ' '* '::=' terminal(:LHS, LHS) do |value, prod| - value.to_s.scan(/(?:\[([^\]]+)\])?\s*?\s*::=/).first + md = value.to_s.scan(/(?:\[([^\]]+)\])?\s*?\s*::=/).first + if @lhs_includes_identifier.nil? + @lhs_includes_identifier = !md[0].nil? + @range = md[0] ? RANGE_NOT_LHS : RANGE + elsif @lhs_includes_identifier && !md[0] + error("LHS", + "Rule does not begin with a [xxx] identifier, which was established on the first rule", + production: :LHS, + rest: value) + elsif !@lhs_includes_identifier && md[0] + error("LHS", + "Rule begins with a [xxx] identifier, which was not established on the first rule", + production: :LHS, + rest: value) + end + md end # Match `SYMBOL` terminal @@ -48,9 +69,10 @@ class Parser end # Terminal for `RANGE` is matched as part of a `primary` rule. + # Note that this won't match if rules include identifiers. 
# - # [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' - LHS - terminal(:RANGE, RANGE) do |value| + # [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' + terminal(:RANGE, proc {@range}) do |value| [:range, value[1..-2]] end @@ -130,7 +152,9 @@ class Parser # Invoke callback id, sym = value[:LHS] expression = value[:expression] - callback.call(:rule, EBNF::Rule.new(sym.to_sym, id, expression)) + rule = EBNF::Rule.new(sym.to_sym, id, expression) + progress(:rule, rule.to_sxp) + callback.call(:rule, rule) nil end @@ -274,6 +298,9 @@ def initialize(input, **options, &block) tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}} end + # This is established on the first rule. + self.class.instance_variable_set(:@lhs_includes_identifier, nil) + # Read input, if necessary, which will be used in a Scanner. @input = input.respond_to?(:read) ? input.read : input.to_s diff --git a/lib/ebnf/peg/parser.rb b/lib/ebnf/peg/parser.rb index 40b0bb8..9fe39ed 100644 --- a/lib/ebnf/peg/parser.rb +++ b/lib/ebnf/peg/parser.rb @@ -68,10 +68,9 @@ def terminal_options; (@terminal_options ||= {}); end # # @param [Symbol] term # The terminal name. - # @param [Regexp] regexp (nil) - # Pattern used to scan for this terminal, - # defaults to the expression defined in the associated rule. - # If unset, the terminal rule is used for matching. + # @param [Regexp, Proc] regexp + # Pattern used to scan for this terminal. + # Passing a Proc will evaluate that proc to retrieve a regular expression. # @param [Hash] options # @option options [Boolean] :unescape # Cause strings and codepoints to be unescaped. @@ -83,8 +82,8 @@ def terminal_options; (@terminal_options ||= {}); end # @yieldparam [Proc] block # Block passed to initialization for yielding to calling parser. 
# Should conform to the yield specs for #initialize - def terminal(term, regexp = nil, **options, &block) - terminal_regexps[term] = regexp if regexp + def terminal(term, regexp, **options, &block) + terminal_regexps[term] = regexp terminal_handlers[term] = block if block_given? terminal_options[term] = options.freeze end diff --git a/lib/ebnf/peg/rule.rb b/lib/ebnf/peg/rule.rb index 83ca0a8..4073de9 100644 --- a/lib/ebnf/peg/rule.rb +++ b/lib/ebnf/peg/rule.rb @@ -49,6 +49,7 @@ def parse(input, **options) # use that to match the input, # otherwise, if regexp = parser.terminal_regexp(sym) + regexp = regexp.call() if regexp.is_a?(Proc) term_opts = parser.terminal_options(sym) if matched = input.scan(regexp) # Optionally map matched @@ -290,6 +291,7 @@ def rept(input, min, max, prod, string_regexp_opts, **options) def terminal_also_matches(input, prod, string_regexp_opts) str_regex = Regexp.new(Regexp.quote(prod), string_regexp_opts) input.match?(str_regex) && parser.class.terminal_regexps.any? 
do |sym, re| + re = re.call() if re.is_a?(Proc) (match_len = input.match?(re)) && match_len > prod.length end end diff --git a/lib/ebnf/terminals.rb b/lib/ebnf/terminals.rb index 1e40b65..861aa02 100644 --- a/lib/ebnf/terminals.rb +++ b/lib/ebnf/terminals.rb @@ -1,13 +1,14 @@ # encoding: utf-8 # Terminal definitions for the EBNF grammar module EBNF::Terminals - SYMBOL_BASE = %r(\b[a-zA-Z0-9_\.]+\b)u.freeze - SYMBOL = %r(?(?!\s*::=))u.freeze + SYMBOL_BASE = %r(\b[a-zA-Z0-9_\.]+\b)u.freeze # Word boundaries + SYMBOL = %r((?:#{SYMBOL_BASE}|(?:<#{SYMBOL_BASE}>))(?!\s*::=))u.freeze HEX = %r(\#x\h+)u.freeze CHAR = %r([\u0009\u000A\u000D\u0020-\uD7FF\u{10000}-\u{10FFFF}])u.freeze R_CHAR = %r([\u0009\u000A\u000D\u0020-\u002C\u002E-\u005C\u005E-\uD7FF\u{10000}-\u{10FFFF}])u.freeze - RANGE = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\](?!\s+?\s*::=))u.freeze LHS = %r((?:\[#{SYMBOL_BASE}\])?\s*?\s*::=)u.freeze + RANGE = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\])u.freeze + RANGE_NOT_LHS = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\](?!\s*?\s*::=))u.freeze O_RANGE = %r(\[\^(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX}|#{R_CHAR}|#{HEX}))+-?\])u.freeze STRING1 = %r("[\u0009\u000A\u000D\u0020\u0021\u0023-\uD7FF\u{10000}-\u{10FFFF}]*")u.freeze STRING2 = %r('[\u0009\u000A\u000D\u0020-\u0026\u0028-\uD7FF\u{10000}-\u{10FFFF}]*')u.freeze diff --git a/spec/ll1_spec.rb b/spec/ll1_spec.rb index 1abdec7..cebc6d6 100644 --- a/spec/ll1_spec.rb +++ b/spec/ll1_spec.rb @@ -5,6 +5,11 @@ require 'sxp' describe EBNF::Base do + let(:logger) {RDF::Spec.logger} + after(:each) do |example| + puts logger.to_s if example.exception && !example.exception.is_a?(RSpec::Expectations::ExpectationNotMetError) + end + describe "#first_follow" do context "start" do context "with legitimate start rule" do @@ -421,11 +426,8 @@ end def parse(value, **options) - @debug = [] - options = {debug: @debug}.merge(options) ebnf = 
EBNF::Base.new(value, **options) ebnf.make_bnf - @debug.clear ebnf.first_follow(options[:start]) ebnf end diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb index 3213000..db68f14 100644 --- a/spec/parser_spec.rb +++ b/spec/parser_spec.rb @@ -43,9 +43,9 @@ '>')))}, ], "minimal whitespace": [ - %{[xx]minimal::=whitespace[yy]whitespace::=PASS}, - %{((rule minimal "xx" (seq whitespace (range "yy"))) - (rule whitespace (seq PASS)))} + %{[xx]minimal::=whitespace[yy]whitespace::=" "}, + %{((rule minimal "xx" (seq whitespace)) + (rule whitespace "yy" (seq " ")))} ] }.each do |title, (input, expect)| it title do diff --git a/spec/peg/data/parser.rb b/spec/peg/data/parser.rb index 2fe4807..e462615 100644 --- a/spec/peg/data/parser.rb +++ b/spec/peg/data/parser.rb @@ -26,7 +26,7 @@ class EBNFPegParser terminal(:HEX, HEX) - terminal(:RANGE, RANGE, unescape: true) do |value| + terminal(:RANGE, RANGE_NOT_LHS, unescape: true) do |value| [:range, value[1..-2]] end diff --git a/spec/rule_spec.rb b/spec/rule_spec.rb index 525b0df..7e6a269 100644 --- a/spec/rule_spec.rb +++ b/spec/rule_spec.rb @@ -912,7 +912,7 @@ SYMBOL: [:O_SYMBOL], O_SYMBOL: [], HEX: [], - RANGE: [:R_CHAR, :HEX, :LHS], + RANGE: [:R_CHAR, :HEX], O_RANGE: [:R_CHAR, :HEX], STRING1: [:CHAR], STRING2: [:CHAR],