diff --git a/README.md b/README.md
index bec0009..c3e05b4 100644
--- a/README.md
+++ b/README.md
@@ -26,10 +26,9 @@ As LL(1) grammars operate using `alt` and `seq` primitives, allowing for a match
* Transform `a ::= b+` into `a ::= b b*`
* Transform `a ::= b*` into `a ::= _empty | (b a)`
* Transform `a ::= op1 (op2)` into two rules:
- ```
- a ::= op1 _a_1
- _a_1_ ::= op2
- ```
+
+ a ::= op1 _a_1
+ _a_1 ::= op2
Of note in this implementation is that the tokenizer and parser are streaming, so that they can process inputs of arbitrary size.
@@ -96,7 +95,7 @@ The {EBNF::Writer} class can be used to write parsed grammars out, either as for
The formatted HTML results are designed to be appropriate for including in specifications.
### Parser Errors
-On a parsing failure, and exception is raised with information that may be useful in determining the source of the error.
+On a parsing failure, an exception is raised with information that may be useful in determining the source of the error.
## EBNF Grammar
The [EBNF][] variant used here is based on [W3C](https://w3.org/) [EBNF][]
@@ -116,7 +115,7 @@ which can also be proceeded by an optional number enclosed in square brackets to
[1] symbol ::= expression
-(Note, this can introduce an ambiguity if the previous rule ends in a range or enum and the current rule has no number. In this case, enclosing `expression` within parentheses, or adding intervening comments can resolve the ambiguity.)
+(Note, this can introduce an ambiguity if the previous rule ends in a range or enum and the current rule has no number. The parsers dynamically determine the terminal rules for the `LHS` (the identifier, symbol, and `::=`) and `RANGE`.)
Symbols are written in CAPITAL CASE if they are the start symbol of a regular language (terminals), otherwise with they are treated as non-terminal rules. Literal strings are quoted.
diff --git a/etc/ebnf.ebnf b/etc/ebnf.ebnf
index 320f1d6..1f8dfce 100644
--- a/etc/ebnf.ebnf
+++ b/etc/ebnf.ebnf
@@ -5,9 +5,8 @@
# Use the LHS terminal to match the identifier, rule name and assignment due to
# confusion between the identifier and RANGE.
- # Note, for grammars not using identifiers, it is still possible to confuse
- # a rule ending with a range the next rule, as it may be interpreted as an identifier.
- # In such case, best to enclose the rule in '()'.
+ # The PEG parser has special rules for matching LHS and RANGE
+ # so that RANGE is not confused with LHS.
[3] rule ::= LHS expression
[4] expression ::= alt
@@ -40,7 +39,7 @@
[13] HEX ::= '#x' ([a-f] | [A-F] | [0-9])+
- [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' - LHS
+ [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']'
[15] O_RANGE ::= '[^' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']'
diff --git a/etc/ebnf.html b/etc/ebnf.html
index 634f4bc..a640c5e 100644
--- a/etc/ebnf.html
+++ b/etc/ebnf.html
@@ -95,7 +95,7 @@
[14] |
RANGE |
::= |
- '[ ' ( ( R_CHAR '- ' R_CHAR) | ( HEX '- ' HEX) | R_CHAR | HEX) + '- '? ( '] ' - LHS) |
+ '[ ' ( ( R_CHAR '- ' R_CHAR) | ( HEX '- ' HEX) | R_CHAR | HEX) + '- '? '] ' |
[15] |
diff --git a/etc/ebnf.ll1.sxp b/etc/ebnf.ll1.sxp
index 239aebd..1912786 100644
--- a/etc/ebnf.ll1.sxp
+++ b/etc/ebnf.ll1.sxp
@@ -104,10 +104,7 @@
(terminal O_SYMBOL "12a" (plus (alt (range "a-z") (range "A-Z") (range "0-9") '_' '.')))
(terminal HEX "13" (seq '#x' (plus (alt (range "a-f") (range "A-F") (range "0-9")))))
(terminal RANGE "14"
- (seq '['
- (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX))
- (opt '-')
- (diff ']' LHS)) )
+ (seq '[' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']'))
(terminal O_RANGE "15"
(seq '[^' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']'))
(terminal STRING1 "16" (seq '"' (star (diff CHAR '"')) '"'))
diff --git a/etc/ebnf.peg.rb b/etc/ebnf.peg.rb
index c319b09..315e2f6 100644
--- a/etc/ebnf.peg.rb
+++ b/etc/ebnf.peg.rb
@@ -38,13 +38,12 @@ module EBNFMeta
EBNF::Rule.new(:_HEX_3, "13.3", [:range, "a-f"], kind: :terminal).extend(EBNF::PEG::Rule),
EBNF::Rule.new(:_HEX_4, "13.4", [:range, "A-F"], kind: :terminal).extend(EBNF::PEG::Rule),
EBNF::Rule.new(:_HEX_5, "13.5", [:range, "0-9"], kind: :terminal).extend(EBNF::PEG::Rule),
- EBNF::Rule.new(:RANGE, "14", [:seq, "[", :_RANGE_1, :_RANGE_2, :_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule),
- EBNF::Rule.new(:_RANGE_1, "14.1", [:plus, :_RANGE_4], kind: :terminal).extend(EBNF::PEG::Rule),
- EBNF::Rule.new(:_RANGE_4, "14.4", [:alt, :_RANGE_5, :_RANGE_6, :R_CHAR, :HEX], kind: :terminal).extend(EBNF::PEG::Rule),
- EBNF::Rule.new(:_RANGE_5, "14.5", [:seq, :R_CHAR, "-", :R_CHAR], kind: :terminal).extend(EBNF::PEG::Rule),
- EBNF::Rule.new(:_RANGE_6, "14.6", [:seq, :HEX, "-", :HEX], kind: :terminal).extend(EBNF::PEG::Rule),
+ EBNF::Rule.new(:RANGE, "14", [:seq, "[", :_RANGE_1, :_RANGE_2, "]"], kind: :terminal).extend(EBNF::PEG::Rule),
+ EBNF::Rule.new(:_RANGE_1, "14.1", [:plus, :_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule),
+ EBNF::Rule.new(:_RANGE_3, "14.3", [:alt, :_RANGE_4, :_RANGE_5, :R_CHAR, :HEX], kind: :terminal).extend(EBNF::PEG::Rule),
+ EBNF::Rule.new(:_RANGE_4, "14.4", [:seq, :R_CHAR, "-", :R_CHAR], kind: :terminal).extend(EBNF::PEG::Rule),
+ EBNF::Rule.new(:_RANGE_5, "14.5", [:seq, :HEX, "-", :HEX], kind: :terminal).extend(EBNF::PEG::Rule),
EBNF::Rule.new(:_RANGE_2, "14.2", [:opt, "-"], kind: :terminal).extend(EBNF::PEG::Rule),
- EBNF::Rule.new(:_RANGE_3, "14.3", [:diff, "]", :LHS], kind: :terminal).extend(EBNF::PEG::Rule),
EBNF::Rule.new(:O_RANGE, "15", [:seq, "[^", :_O_RANGE_1, :_O_RANGE_2, "]"], kind: :terminal).extend(EBNF::PEG::Rule),
EBNF::Rule.new(:_O_RANGE_1, "15.1", [:plus, :_O_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule),
EBNF::Rule.new(:_O_RANGE_3, "15.3", [:alt, :_O_RANGE_4, :_O_RANGE_5, :R_CHAR, :HEX], kind: :terminal).extend(EBNF::PEG::Rule),
diff --git a/etc/ebnf.peg.sxp b/etc/ebnf.peg.sxp
index 62ad133..6125475 100644
--- a/etc/ebnf.peg.sxp
+++ b/etc/ebnf.peg.sxp
@@ -35,13 +35,12 @@
(terminal _HEX_3 "13.3" (range "a-f"))
(terminal _HEX_4 "13.4" (range "A-F"))
(terminal _HEX_5 "13.5" (range "0-9"))
- (terminal RANGE "14" (seq '[' _RANGE_1 _RANGE_2 _RANGE_3))
- (terminal _RANGE_1 "14.1" (plus _RANGE_4))
- (terminal _RANGE_4 "14.4" (alt _RANGE_5 _RANGE_6 R_CHAR HEX))
- (terminal _RANGE_5 "14.5" (seq R_CHAR '-' R_CHAR))
- (terminal _RANGE_6 "14.6" (seq HEX '-' HEX))
+ (terminal RANGE "14" (seq '[' _RANGE_1 _RANGE_2 ']'))
+ (terminal _RANGE_1 "14.1" (plus _RANGE_3))
+ (terminal _RANGE_3 "14.3" (alt _RANGE_4 _RANGE_5 R_CHAR HEX))
+ (terminal _RANGE_4 "14.4" (seq R_CHAR '-' R_CHAR))
+ (terminal _RANGE_5 "14.5" (seq HEX '-' HEX))
(terminal _RANGE_2 "14.2" (opt '-'))
- (terminal _RANGE_3 "14.3" (diff ']' LHS))
(terminal O_RANGE "15" (seq '[^' _O_RANGE_1 _O_RANGE_2 ']'))
(terminal _O_RANGE_1 "15.1" (plus _O_RANGE_3))
(terminal _O_RANGE_3 "15.3" (alt _O_RANGE_4 _O_RANGE_5 R_CHAR HEX))
diff --git a/etc/ebnf.sxp b/etc/ebnf.sxp
index 67cca4c..435c333 100644
--- a/etc/ebnf.sxp
+++ b/etc/ebnf.sxp
@@ -16,10 +16,7 @@
(terminal O_SYMBOL "12a" (plus (alt (range "a-z") (range "A-Z") (range "0-9") '_' '.')))
(terminal HEX "13" (seq '#x' (plus (alt (range "a-f") (range "A-F") (range "0-9")))))
(terminal RANGE "14"
- (seq '['
- (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX))
- (opt '-')
- (diff ']' LHS)) )
+ (seq '[' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']'))
(terminal O_RANGE "15"
(seq '[^' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']'))
(terminal STRING1 "16" (seq '"' (star (diff CHAR '"')) '"'))
diff --git a/lib/ebnf/parser.rb b/lib/ebnf/parser.rb
index 4013559..0dc4d68 100644
--- a/lib/ebnf/parser.rb
+++ b/lib/ebnf/parser.rb
@@ -11,6 +11,12 @@ class Parser
# @return [Array]
attr_reader :ast
+ # Whether rule LHSs begin with a `[xxx]` identifier; nil until established by the first rule
+ attr_reader :lhs_includes_identifier
+
+ # Regular expression to match a [...] range, which may be distinguished from an LHS
+ attr_reader :range
+
# ## Terminals
# Define rules for Terminals, placing results on the input stack, making them available to upstream non-Terminal rules.
#
@@ -28,7 +34,22 @@ class Parser
#
# [11] LHS ::= ('[' SYMBOL+ ']' ' '+)? SYMBOL >? ' '* '::='
terminal(:LHS, LHS) do |value, prod|
- value.to_s.scan(/(?:\[([^\]]+)\])?\s*(\w+)>?\s*::=/).first
+ md = value.to_s.scan(/(?:\[([^\]]+)\])?\s*(\w+)>?\s*::=/).first
+ if @lhs_includes_identifier.nil?
+ @lhs_includes_identifier = !md[0].nil?
+ @range = md[0] ? RANGE_NOT_LHS : RANGE
+ elsif @lhs_includes_identifier && !md[0]
+ error("LHS",
+ "Rule does not begin with a [xxx] identifier, which was established on the first rule",
+ production: :LHS,
+ rest: value)
+ elsif !@lhs_includes_identifier && md[0]
+ error("LHS",
+ "Rule begins with a [xxx] identifier, which was not established on the first rule",
+ production: :LHS,
+ rest: value)
+ end
+ md
end
# Match `SYMBOL` terminal
@@ -48,9 +69,10 @@ class Parser
end
# Terminal for `RANGE` is matched as part of a `primary` rule.
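+ # Note that the pattern is resolved dynamically from `@range`, set when the first LHS is matched: if rules include identifiers, a pattern which does not match a `[...]` followed by `symbol ::=` is used.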
#
- # [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' - LHS
- terminal(:RANGE, RANGE) do |value|
+ # [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']'
+ terminal(:RANGE, proc {@range}) do |value|
[:range, value[1..-2]]
end
@@ -130,7 +152,9 @@ class Parser
# Invoke callback
id, sym = value[:LHS]
expression = value[:expression]
- callback.call(:rule, EBNF::Rule.new(sym.to_sym, id, expression))
+ rule = EBNF::Rule.new(sym.to_sym, id, expression)
+ progress(:rule, rule.to_sxp)
+ callback.call(:rule, rule)
nil
end
@@ -274,6 +298,9 @@ def initialize(input, **options, &block)
tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}}
end
+ # This is established on the first rule.
+ self.class.instance_variable_set(:@lhs_includes_identifier, nil)
+
# Read input, if necessary, which will be used in a Scanner.
@input = input.respond_to?(:read) ? input.read : input.to_s
diff --git a/lib/ebnf/peg/parser.rb b/lib/ebnf/peg/parser.rb
index 40b0bb8..9fe39ed 100644
--- a/lib/ebnf/peg/parser.rb
+++ b/lib/ebnf/peg/parser.rb
@@ -68,10 +68,9 @@ def terminal_options; (@terminal_options ||= {}); end
#
# @param [Symbol] term
# The terminal name.
- # @param [Regexp] regexp (nil)
- # Pattern used to scan for this terminal,
- # defaults to the expression defined in the associated rule.
- # If unset, the terminal rule is used for matching.
+ # @param [Regexp, Proc] regexp
+ # Pattern used to scan for this terminal.
+ # Passing a Proc will evaluate that proc to retrieve a regular expression.
# @param [Hash] options
# @option options [Boolean] :unescape
# Cause strings and codepoints to be unescaped.
@@ -83,8 +82,8 @@ def terminal_options; (@terminal_options ||= {}); end
# @yieldparam [Proc] block
# Block passed to initialization for yielding to calling parser.
# Should conform to the yield specs for #initialize
- def terminal(term, regexp = nil, **options, &block)
- terminal_regexps[term] = regexp if regexp
+ def terminal(term, regexp, **options, &block)
+ terminal_regexps[term] = regexp
terminal_handlers[term] = block if block_given?
terminal_options[term] = options.freeze
end
diff --git a/lib/ebnf/peg/rule.rb b/lib/ebnf/peg/rule.rb
index 83ca0a8..4073de9 100644
--- a/lib/ebnf/peg/rule.rb
+++ b/lib/ebnf/peg/rule.rb
@@ -49,6 +49,7 @@ def parse(input, **options)
# use that to match the input,
# otherwise,
if regexp = parser.terminal_regexp(sym)
+ regexp = regexp.call() if regexp.is_a?(Proc)
term_opts = parser.terminal_options(sym)
if matched = input.scan(regexp)
# Optionally map matched
@@ -290,6 +291,7 @@ def rept(input, min, max, prod, string_regexp_opts, **options)
def terminal_also_matches(input, prod, string_regexp_opts)
str_regex = Regexp.new(Regexp.quote(prod), string_regexp_opts)
input.match?(str_regex) && parser.class.terminal_regexps.any? do |sym, re|
+ re = re.call() if re.is_a?(Proc)
(match_len = input.match?(re)) && match_len > prod.length
end
end
diff --git a/lib/ebnf/terminals.rb b/lib/ebnf/terminals.rb
index 1e40b65..861aa02 100644
--- a/lib/ebnf/terminals.rb
+++ b/lib/ebnf/terminals.rb
@@ -1,13 +1,14 @@
# encoding: utf-8
# Terminal definitions for the EBNF grammar
module EBNF::Terminals
- SYMBOL_BASE = %r(\b[a-zA-Z0-9_\.]+\b)u.freeze
- SYMBOL = %r(#{SYMBOL_BASE}>?(?!\s*::=))u.freeze
+ SYMBOL_BASE = %r(\b[a-zA-Z0-9_\.]+\b)u.freeze # Word boundaries
+ SYMBOL = %r((?:#{SYMBOL_BASE}|(?:<#{SYMBOL_BASE}>))(?!\s*::=))u.freeze
HEX = %r(\#x\h+)u.freeze
CHAR = %r([\u0009\u000A\u000D\u0020-\uD7FF\u{10000}-\u{10FFFF}])u.freeze
R_CHAR = %r([\u0009\u000A\u000D\u0020-\u002C\u002E-\u005C\u005E-\uD7FF\u{10000}-\u{10FFFF}])u.freeze
- RANGE = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\](?!\s+#{SYMBOL_BASE}>?\s*::=))u.freeze
LHS = %r((?:\[#{SYMBOL_BASE}\])?\s*#{SYMBOL_BASE}>?\s*::=)u.freeze
+ RANGE = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\])u.freeze
+ RANGE_NOT_LHS = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\](?!\s*#{SYMBOL_BASE}>?\s*::=))u.freeze
O_RANGE = %r(\[\^(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX}|#{R_CHAR}|#{HEX}))+-?\])u.freeze
STRING1 = %r("[\u0009\u000A\u000D\u0020\u0021\u0023-\uD7FF\u{10000}-\u{10FFFF}]*")u.freeze
STRING2 = %r('[\u0009\u000A\u000D\u0020-\u0026\u0028-\uD7FF\u{10000}-\u{10FFFF}]*')u.freeze
diff --git a/spec/ll1_spec.rb b/spec/ll1_spec.rb
index 1abdec7..cebc6d6 100644
--- a/spec/ll1_spec.rb
+++ b/spec/ll1_spec.rb
@@ -5,6 +5,11 @@
require 'sxp'
describe EBNF::Base do
+ let(:logger) {RDF::Spec.logger}
+ after(:each) do |example|
+ puts logger.to_s if example.exception && !example.exception.is_a?(RSpec::Expectations::ExpectationNotMetError)
+ end
+
describe "#first_follow" do
context "start" do
context "with legitimate start rule" do
@@ -421,11 +426,8 @@
end
def parse(value, **options)
- @debug = []
- options = {debug: @debug}.merge(options)
ebnf = EBNF::Base.new(value, **options)
ebnf.make_bnf
- @debug.clear
ebnf.first_follow(options[:start])
ebnf
end
diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb
index 3213000..db68f14 100644
--- a/spec/parser_spec.rb
+++ b/spec/parser_spec.rb
@@ -43,9 +43,9 @@
'>')))},
],
"minimal whitespace": [
- %{[xx]minimal::=whitespace[yy]whitespace::=PASS},
- %{((rule minimal "xx" (seq whitespace (range "yy")))
- (rule whitespace (seq PASS)))}
+ %{[xx]minimal::=whitespace[yy]whitespace::=" "},
+ %{((rule minimal "xx" (seq whitespace))
+ (rule whitespace "yy" (seq " ")))}
]
}.each do |title, (input, expect)|
it title do
diff --git a/spec/peg/data/parser.rb b/spec/peg/data/parser.rb
index 2fe4807..e462615 100644
--- a/spec/peg/data/parser.rb
+++ b/spec/peg/data/parser.rb
@@ -26,7 +26,7 @@ class EBNFPegParser
terminal(:HEX, HEX)
- terminal(:RANGE, RANGE, unescape: true) do |value|
+ terminal(:RANGE, RANGE_NOT_LHS, unescape: true) do |value|
[:range, value[1..-2]]
end
diff --git a/spec/rule_spec.rb b/spec/rule_spec.rb
index 525b0df..7e6a269 100644
--- a/spec/rule_spec.rb
+++ b/spec/rule_spec.rb
@@ -912,7 +912,7 @@
SYMBOL: [:O_SYMBOL],
O_SYMBOL: [],
HEX: [],
- RANGE: [:R_CHAR, :HEX, :LHS],
+ RANGE: [:R_CHAR, :HEX],
O_RANGE: [:R_CHAR, :HEX],
STRING1: [:CHAR],
STRING2: [:CHAR],