diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 874f1f3..473bc09 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - ruby: ['3.0', 3.1, 3.2, ruby-head, jruby] + ruby: ['3.0', 3.1, 3.2, 3.3, ruby-head, jruby] steps: - name: Clone repository uses: actions/checkout@v3 @@ -33,6 +33,6 @@ jobs: run: ruby --version; bundle exec rspec spec || $ALLOW_FAILURES - name: Coveralls GitHub Action uses: coverallsapp/github-action@v2 - if: "matrix.ruby == '3.2'" + if: "matrix.ruby == '3.3'" with: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/Gemfile b/Gemfile index 37b84f8..4da7dec 100644 --- a/Gemfile +++ b/Gemfile @@ -13,6 +13,7 @@ group :development do gem "redcarpet", platforms: :mri gem "rocco", platforms: :mri gem "pygmentize", platforms: :mri + gem 'getoptlong' end group :development, :test do diff --git a/README.md b/README.md index dc0bb1b..c3e05b4 100644 --- a/README.md +++ b/README.md @@ -26,10 +26,9 @@ As LL(1) grammars operate using `alt` and `seq` primitives, allowing for a match * Transform `a ::= b+` into `a ::= b b*` * Transform `a ::= b*` into `a ::= _empty | (b a)` * Transform `a ::= op1 (op2)` into two rules: - ``` - a ::= op1 _a_1 - _a_1_ ::= op2 - ``` + + a ::= op1 _a_1 + _a_1_ ::= op2 Of note in this implementation is that the tokenizer and parser are streaming, so that they can process inputs of arbitrary size. @@ -75,7 +74,7 @@ Generate formatted grammar using HTML (requires [Haml][Haml] gem): ### Parsing an ISO/IEC 14977 Grammar -The EBNF gem can also parse [ISO/EIC 14977] Grammars (ISOEBNF) to [S-Expressions][S-Expression]. +The EBNF gem can also parse [ISO/IEC 14977][] Grammars (ISOEBNF) to [S-Expressions][S-Expression]. 
grammar = EBNF.parse(File.open('./etc/iso-ebnf.isoebnf'), format: :isoebnf) @@ -96,7 +95,7 @@ The {EBNF::Writer} class can be used to write parsed grammars out, either as for The formatted HTML results are designed to be appropriate for including in specifications. ### Parser Errors -On a parsing failure, and exception is raised with information that may be useful in determining the source of the error. +On a parsing failure, an exception is raised with information that may be useful in determining the source of the error. ## EBNF Grammar The [EBNF][] variant used here is based on [W3C](https://w3.org/) [EBNF][] @@ -104,7 +103,7 @@ The [EBNF][] variant used here is based on [W3C](https://w3.org/) [EBNF][] as defined in the [XML 1.0 recommendation](https://www.w3.org/TR/REC-xml/), with minor extensions: -Note that the grammar includes an optional `[identifer]` in front of rule names, which can be in conflict with the `RANGE` terminal. It is typically not a problem, but if it comes up, try parsing with the `native` parser, add comments or sequences to disambiguate. EBNF does not have beginning of line checks as all whitespace is treated the same, so the common practice of identifying each rule inherently leads to such ambiguity. +Note that the grammar includes an optional `[number]` in front of rule names, which can be in conflict with the `RANGE` terminal. It is typically not a problem, but if it comes up, try parsing with the `native` parser, add comments or sequences to disambiguate. EBNF does not have beginning of line checks as all whitespace is treated the same, so the common practice of identifying each rule inherently leads to such ambiguity. The character set for EBNF is UTF-8. @@ -116,7 +115,7 @@ which can also be proceeded by an optional number enclosed in square brackets to [1] symbol ::= expression -(Note, this can introduce an ambiguity if the previous rule ends in a range or enum and the current rule has no identifier. 
In this case, enclosing `expression` within parentheses, or adding intervening comments can resolve the ambiguity.) +(Note, this introduces an ambiguity if the previous rule ends in a range or enum and the current rule has no number. The parsers dynamically determine the terminal rules for the `LHS` (the identifier, symbol, and `::=`) and `RANGE`). Symbols are written in CAPITAL CASE if they are the start symbol of a regular language (terminals), otherwise with they are treated as non-terminal rules. Literal strings are quoted. @@ -134,7 +133,7 @@ Within the expression on the right-hand side of a rule, the following expression [^abc], [^#xN#xN#xN] matches any UTF-8 R\_CHAR or HEX with a value not among the characters given. The last component may be '-'. Enumerations and ranges of excluded values may be mixed in one set of brackets. "string" - matches a literal string matching that given inside the double quotes. + matches a literal string matching that given inside the double quotes case insensitively. 'string' matches a literal string matching that given inside the single quotes. A (B | C) @@ -158,7 +157,8 @@ Within the expression on the right-hand side of a rule, the following expression * Comments include `//` and `#` through end of line (other than hex character) and `/* ... */ (* ... *) which may cross lines` -* All rules **MAY** start with an identifier, contained within square brackets. For example `[1] rule`, where the value within the brackets is a symbol `([a-z] | [A-Z] | [0-9] | "_" | ".")+` +* All rules **MAY** start with a number, contained within square brackets. For example `[1] rule`, where the value within the brackets is a symbol `([a-z] | [A-Z] | [0-9] | "_" | ".")+`, which is not retained after parsing +* Symbols **MAY** be enclosed in angle brackets `<` and `>`, which are dropped when parsing. * `@terminals` causes following rules to be treated as terminals. 
Any terminal which is all upper-case (eg`TERMINAL`), or any rules with expressions that match characters (`#xN`, `[a-z]`, `[^a-z]`, `[abc]`, `[^abc]`, `"string"`, `'string'`, or `A - B`), are also treated as terminals. * `@pass` defines the expression used to detect whitespace, which is removed in processing. * No support for `wfc` (well-formedness constraint) or `vc` (validity constraint). @@ -177,7 +177,7 @@ Intermediate representations of the grammar may be serialized to Lisp-like [S-Ex is serialized as - (rule ebnf "1" (star (alt declaration rule))) + (rule ebnf (star (alt declaration rule))) Different components of an EBNF rule expression are transformed into their own operator: diff --git a/VERSION b/VERSION index 437459c..e70b452 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.5.0 +2.6.0 diff --git a/bin/ebnf b/bin/ebnf index b84e069..3018736 100755 --- a/bin/ebnf +++ b/bin/ebnf @@ -9,6 +9,7 @@ $:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "..", 'lib'))) require 'rubygems' require 'getoptlong' require 'ebnf' +require 'rdf/spec' options = { output_format: :sxp, @@ -86,7 +87,11 @@ end input = File.open(ARGV[0]) if ARGV[0] -ebnf = EBNF.parse(input || STDIN, **options) +logger = Logger.new(STDERR) +logger.level = options[:level] || Logger::ERROR +logger.formatter = lambda {|severity, datetime, progname, msg| "%5s %s\n" % [severity, msg]} + +ebnf = EBNF.parse(input || STDIN, logger: logger, **options) ebnf.make_bnf if options[:bnf] || options[:ll1] ebnf.make_peg if options[:peg] if options[:ll1] diff --git a/ebnf.gemspec b/ebnf.gemspec index 049d7e5..2dabea7 100755 --- a/ebnf.gemspec +++ b/ebnf.gemspec @@ -35,6 +35,7 @@ Gem::Specification.new do |gem| gem.add_runtime_dependency 'rdf', '~> 3.3' # Required by sxp gem.add_runtime_dependency 'htmlentities', '~> 4.3' gem.add_runtime_dependency 'unicode-types', '~> 1.8' + gem.add_runtime_dependency 'base64', '~> 0.2' gem.add_development_dependency 'amazing_print', '~> 1.4' 
gem.add_development_dependency 'rdf-spec', '~> 3.3' gem.add_development_dependency 'rdf-turtle', '~> 3.3' diff --git a/etc/ebnf.ebnf b/etc/ebnf.ebnf index a46cdd8..1f8dfce 100644 --- a/etc/ebnf.ebnf +++ b/etc/ebnf.ebnf @@ -5,9 +5,8 @@ # Use the LHS terminal to match the identifier, rule name and assignment due to # confusion between the identifier and RANGE. - # Note, for grammars not using identifiers, it is still possible to confuse - # a rule ending with a range the next rule, as it may be interpreted as an identifier. - # In such case, best to enclose the rule in '()'. + # The PEG parser has special rules for matching LHS and RANGE + # so that RANGE is not confused with LHS. [3] rule ::= LHS expression [4] expression ::= alt @@ -34,11 +33,13 @@ [11] LHS ::= ('[' SYMBOL ']' ' '+)? SYMBOL ' '* '::=' - [12] SYMBOL ::= ([a-z] | [A-Z] | [0-9] | '_' | '.')+ + [12] SYMBOL ::= '<' O_SYMBOL '>' | O_SYMBOL + + [12a] O_SYMBOL ::= ([a-z] | [A-Z] | [0-9] | '_' | '.')+ [13] HEX ::= '#x' ([a-f] | [A-F] | [0-9])+ - [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' - LHS + [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' [15] O_RANGE ::= '[^' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? 
']' diff --git a/etc/ebnf.html b/etc/ebnf.html index 49fa571..a640c5e 100644 --- a/etc/ebnf.html +++ b/etc/ebnf.html @@ -1,4 +1,4 @@ - + @@ -77,6 +77,12 @@ + + + + + + @@ -89,7 +95,7 @@ - + diff --git a/etc/ebnf.ll1.rb b/etc/ebnf.ll1.rb index 2a50869..5a5e0da 100644 --- a/etc/ebnf.ll1.rb +++ b/etc/ebnf.ll1.rb @@ -1,4 +1,4 @@ -# This file is automatically generated by ebnf version 2.4.0 +# This file is automatically generated by ebnf version 2.5.0 # Derived from etc/ebnf.ebnf module Meta START = :ebnf diff --git a/etc/ebnf.ll1.sxp b/etc/ebnf.ll1.sxp index c64ce53..1912786 100644 --- a/etc/ebnf.ll1.sxp +++ b/etc/ebnf.ll1.sxp @@ -100,13 +100,11 @@ (seq '@pass' expression)) (terminals _terminals (seq)) (terminal LHS "11" (seq (opt (seq '[' SYMBOL ']' (plus ' '))) SYMBOL (star ' ') '::=')) - (terminal SYMBOL "12" (plus (alt (range "a-z") (range "A-Z") (range "0-9") '_' '.'))) + (terminal SYMBOL "12" (alt (seq '<' O_SYMBOL '>') O_SYMBOL)) + (terminal O_SYMBOL "12a" (plus (alt (range "a-z") (range "A-Z") (range "0-9") '_' '.'))) (terminal HEX "13" (seq '#x' (plus (alt (range "a-f") (range "A-F") (range "0-9"))))) (terminal RANGE "14" - (seq '[' - (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) - (opt '-') - (diff ']' LHS)) ) + (seq '[' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']')) (terminal O_RANGE "15" (seq '[^' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']')) (terminal STRING1 "16" (seq '"' (star (diff CHAR '"')) '"')) diff --git a/etc/ebnf.peg.rb b/etc/ebnf.peg.rb index 070430d..315e2f6 100644 --- a/etc/ebnf.peg.rb +++ b/etc/ebnf.peg.rb @@ -1,4 +1,4 @@ -# This file is automatically generated by ebnf version 2.4.0 +# This file is automatically generated by ebnf version 2.5.0 # Derived from etc/ebnf.ebnf module EBNFMeta RULES = [ @@ -25,24 +25,25 @@ module EBNFMeta EBNF::Rule.new(:_LHS_3, "11.3", [:seq, "[", :SYMBOL, "]", :_LHS_4], kind: :terminal).extend(EBNF::PEG::Rule), 
EBNF::Rule.new(:_LHS_4, "11.4", [:plus, " "], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_LHS_2, "11.2", [:star, " "], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:SYMBOL, "12", [:plus, :_SYMBOL_1], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_SYMBOL_1, "12.1", [:alt, :_SYMBOL_2, :_SYMBOL_3, :_SYMBOL_4, "_", "."], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_SYMBOL_2, "12.2", [:range, "a-z"], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_SYMBOL_3, "12.3", [:range, "A-Z"], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_SYMBOL_4, "12.4", [:range, "0-9"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:SYMBOL, "12", [:alt, :_SYMBOL_1, :O_SYMBOL], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_SYMBOL_1, "12.1", [:seq, "<", :O_SYMBOL, ">"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:O_SYMBOL, "12a", [:plus, :_O_SYMBOL_1], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_O_SYMBOL_1, "12a.1", [:alt, :_O_SYMBOL_2, :_O_SYMBOL_3, :_O_SYMBOL_4, "_", "."], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_O_SYMBOL_2, "12a.2", [:range, "a-z"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_O_SYMBOL_3, "12a.3", [:range, "A-Z"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_O_SYMBOL_4, "12a.4", [:range, "0-9"], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:HEX, "13", [:seq, "#x", :_HEX_1], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_1, "13.1", [:plus, :_HEX_2], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_2, "13.2", [:alt, :_HEX_3, :_HEX_4, :_HEX_5], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_3, "13.3", [:range, "a-f"], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_4, "13.4", [:range, "A-F"], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_5, "13.5", [:range, "0-9"], kind: 
:terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:RANGE, "14", [:seq, "[", :_RANGE_1, :_RANGE_2, :_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_1, "14.1", [:plus, :_RANGE_4], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_4, "14.4", [:alt, :_RANGE_5, :_RANGE_6, :R_CHAR, :HEX], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_5, "14.5", [:seq, :R_CHAR, "-", :R_CHAR], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_6, "14.6", [:seq, :HEX, "-", :HEX], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:RANGE, "14", [:seq, "[", :_RANGE_1, :_RANGE_2, "]"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_RANGE_1, "14.1", [:plus, :_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_RANGE_3, "14.3", [:alt, :_RANGE_4, :_RANGE_5, :R_CHAR, :HEX], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_RANGE_4, "14.4", [:seq, :R_CHAR, "-", :R_CHAR], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_RANGE_5, "14.5", [:seq, :HEX, "-", :HEX], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_RANGE_2, "14.2", [:opt, "-"], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_RANGE_3, "14.3", [:diff, "]", :LHS], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:O_RANGE, "15", [:seq, "[^", :_O_RANGE_1, :_O_RANGE_2, "]"], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_O_RANGE_1, "15.1", [:plus, :_O_RANGE_3], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_O_RANGE_3, "15.3", [:alt, :_O_RANGE_4, :_O_RANGE_5, :R_CHAR, :HEX], kind: :terminal).extend(EBNF::PEG::Rule), diff --git a/etc/ebnf.peg.sxp b/etc/ebnf.peg.sxp index 78bd279..6125475 100644 --- a/etc/ebnf.peg.sxp +++ b/etc/ebnf.peg.sxp @@ -22,24 +22,25 @@ (terminal _LHS_3 "11.3" (seq '[' SYMBOL ']' _LHS_4)) (terminal _LHS_4 "11.4" (plus ' ')) (terminal _LHS_2 "11.2" (star ' ')) - (terminal SYMBOL "12" (plus _SYMBOL_1)) - (terminal _SYMBOL_1 
"12.1" (alt _SYMBOL_2 _SYMBOL_3 _SYMBOL_4 '_' '.')) - (terminal _SYMBOL_2 "12.2" (range "a-z")) - (terminal _SYMBOL_3 "12.3" (range "A-Z")) - (terminal _SYMBOL_4 "12.4" (range "0-9")) + (terminal SYMBOL "12" (alt _SYMBOL_1 O_SYMBOL)) + (terminal _SYMBOL_1 "12.1" (seq '<' O_SYMBOL '>')) + (terminal O_SYMBOL "12a" (plus _O_SYMBOL_1)) + (terminal _O_SYMBOL_1 "12a.1" (alt _O_SYMBOL_2 _O_SYMBOL_3 _O_SYMBOL_4 '_' '.')) + (terminal _O_SYMBOL_2 "12a.2" (range "a-z")) + (terminal _O_SYMBOL_3 "12a.3" (range "A-Z")) + (terminal _O_SYMBOL_4 "12a.4" (range "0-9")) (terminal HEX "13" (seq '#x' _HEX_1)) (terminal _HEX_1 "13.1" (plus _HEX_2)) (terminal _HEX_2 "13.2" (alt _HEX_3 _HEX_4 _HEX_5)) (terminal _HEX_3 "13.3" (range "a-f")) (terminal _HEX_4 "13.4" (range "A-F")) (terminal _HEX_5 "13.5" (range "0-9")) - (terminal RANGE "14" (seq '[' _RANGE_1 _RANGE_2 _RANGE_3)) - (terminal _RANGE_1 "14.1" (plus _RANGE_4)) - (terminal _RANGE_4 "14.4" (alt _RANGE_5 _RANGE_6 R_CHAR HEX)) - (terminal _RANGE_5 "14.5" (seq R_CHAR '-' R_CHAR)) - (terminal _RANGE_6 "14.6" (seq HEX '-' HEX)) + (terminal RANGE "14" (seq '[' _RANGE_1 _RANGE_2 ']')) + (terminal _RANGE_1 "14.1" (plus _RANGE_3)) + (terminal _RANGE_3 "14.3" (alt _RANGE_4 _RANGE_5 R_CHAR HEX)) + (terminal _RANGE_4 "14.4" (seq R_CHAR '-' R_CHAR)) + (terminal _RANGE_5 "14.5" (seq HEX '-' HEX)) (terminal _RANGE_2 "14.2" (opt '-')) - (terminal _RANGE_3 "14.3" (diff ']' LHS)) (terminal O_RANGE "15" (seq '[^' _O_RANGE_1 _O_RANGE_2 ']')) (terminal _O_RANGE_1 "15.1" (plus _O_RANGE_3)) (terminal _O_RANGE_3 "15.3" (alt _O_RANGE_4 _O_RANGE_5 R_CHAR HEX)) diff --git a/etc/ebnf.sxp b/etc/ebnf.sxp index 028b01f..435c333 100644 --- a/etc/ebnf.sxp +++ b/etc/ebnf.sxp @@ -12,13 +12,11 @@ (rule pass "10" (seq '@pass' expression)) (terminals _terminals (seq)) (terminal LHS "11" (seq (opt (seq '[' SYMBOL ']' (plus ' '))) SYMBOL (star ' ') '::=')) - (terminal SYMBOL "12" (plus (alt (range "a-z") (range "A-Z") (range "0-9") '_' '.'))) + (terminal SYMBOL "12" 
(alt (seq '<' O_SYMBOL '>') O_SYMBOL)) + (terminal O_SYMBOL "12a" (plus (alt (range "a-z") (range "A-Z") (range "0-9") '_' '.'))) (terminal HEX "13" (seq '#x' (plus (alt (range "a-f") (range "A-F") (range "0-9"))))) (terminal RANGE "14" - (seq '[' - (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) - (opt '-') - (diff ']' LHS)) ) + (seq '[' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']')) (terminal O_RANGE "15" (seq '[^' (plus (alt (seq R_CHAR '-' R_CHAR) (seq HEX '-' HEX) R_CHAR HEX)) (opt '-') ']')) (terminal STRING1 "16" (seq '"' (star (diff CHAR '"')) '"')) diff --git a/etc/iso-ebnf.isoebnf b/etc/iso-ebnf.isoebnf index 96c940f..8cd2cb4 100644 --- a/etc/iso-ebnf.isoebnf +++ b/etc/iso-ebnf.isoebnf @@ -1,4 +1,3 @@ -(* W3C EBNF for ISO/IEC 14977 : 1996 EBNF *) (* Scoured from https://www.cl.cam.ac.uk/~mgk25/iso-14977.pdf *) syntax = syntax_rule, {syntax_rule} ; @@ -44,10 +43,10 @@ repeated_sequence = start_repeat_symbol, definitions_list, end_repeat_symbol grouped_sequence = '(', definitions_list, ')' (* The brackets ( and ) allow any to be a *); -terminal_string = ("'", first_terminal_character, {first_terminal_character}, "'") - | ('"', second_terminal_character, {second_terminal_character}, '"') - (* A represents the - between the quote symbols '_' or "_" *); +terminal_string = ("'", first_terminal_character, {first_terminal_character}, "'") + | ('"', second_terminal_character, {second_terminal_character}, '"') + (* A represents the + between the quote symbols '_' or "_" *); meta_identifier = letter, {meta_identifier_character} (* A is the name of a syntactic element of the language being defined *); diff --git a/examples/JSON/JSON-ab.ebnf b/examples/JSON/JSON-ab.ebnf new file mode 100644 index 0000000..da8c74f --- /dev/null +++ b/examples/JSON/JSON-ab.ebnf @@ -0,0 +1,36 @@ + ::= ( | )+ + + ::= ( '"' | "/" | "b" | "f" | "n" | "r" | "t" | | ) + ::= | "`" + ::= | | " " | "!" 
| "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*+" | "," | "-" | "." | "/" | ":" | ";" | "<" | ">" | "?" | "@" | "[" | "]" | "^" | "_" | "`" | "{" | "|" | "}" | "~" + ::= | | " " | "!" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*+" | "," | "-" | "." | "/" | ":" | ";" | "<" | ">" | "?" | "@" | "[" | "]" | "^" | "_" | "{" | "|" | "}" | "~" + + ::= "u" + + ::= "\" + ::= [0-9] + ::= [A-Z] | [a-z] | "_" + ::= '"' + +/* The ``json-value`` is any valid JSON value with the one exception that the */ +/* ``%x60`` character must be escaped. While it's encouraged that implementations */ +/* use any existing JSON parser for this grammar rule (after handling the escaped */ +/* literal characters), the grammar rule is shown below for completeness:: */ + + ::= + | + | + | + | + | + + ::= "null" + ::= "true" | "false" + ::= "-"? ( "0" | [1-9] [0-9]* ) ( "." [0-9]+ )? ( "e" ( "-" | "+" ) [0-9]+ )? + ::= "[" ( ( "," )* )? "]" + ::= "{" ( ( "," )* )? "}" + ::= ( | )* + + ::= ":" + ::= " "* + \ No newline at end of file diff --git a/examples/JSON/JSON.ebnf b/examples/JSON/JSON.ebnf new file mode 100644 index 0000000..318f5b6 --- /dev/null +++ b/examples/JSON/JSON.ebnf @@ -0,0 +1,38 @@ +quoted_string ::= quote ( unescaped_char | escaped_char )+ quote + +escaped_char ::= escape ( '"' | "/" | "b" | "f" | "n" | "r" | "t" | unicode | escape ) +escaped_literal ::= escaped_char | escape "`" +unescaped_char ::= digit | letter | " " | "!" | "#" | "$" | "%" | "&" | "(" | ")" | "*+" | "," | "-" | "." | "/" | ":" | ";" | "<" | ">" | "?" | "@" | "[" | "]" | "^" | "_" | "`" | "{" | "|" | "}" | "~" +unescaped_literal ::= digit | letter | " " | "!" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*+" | "," | "-" | "." | "/" | ":" | ";" | "<" | ">" | "?" 
| "@" | "[" | "]" | "^" | "_" | "{" | "|" | "}" | "~" + +unicode ::= "u" digit digit digit digit + +escape ::= "\" + +digit ::= [0-9] + +letter ::= [A-Z] | [a-z] | "_" +quote ::= '"' + +/* The ``json-value`` is any valid JSON value with the one exception that the */ +/* ``%x60`` character must be escaped. While it's encouraged that implementations */ +/* use any existing JSON parser for this grammar rule (after handling the escaped */ +/* literal characters), the grammar rule is shown below for completeness:: */ + + json_value ::= json_array + | json_boolean + | json_null + | json_number + | json_object + | json_string + + json_null ::= "null" + json_boolean ::= "true" | "false" + json_number ::= "-"? ( "0" | [1-9] [0-9]* ) ( "." [0-9]+ )? ( "e" ( "-" | "+" ) [0-9]+ )? + json_array ::= ws "[" ( ws json_value ws ( "," ws json_value ws )* )? "]" ws + json_object ::= ws "{" ws ( member ws ( "," ws member ws )* )? "}" ws + json_string ::= quote ( unescaped_literal | escaped_literal )* quote + + member ::= quoted_string ws ":" ws json_value + ws ::= " "* + \ No newline at end of file diff --git a/examples/abnf/parser.rb b/examples/abnf/parser.rb index 94c10fb..fc5a18b 100644 --- a/examples/abnf/parser.rb +++ b/examples/abnf/parser.rb @@ -238,10 +238,10 @@ class ABNFParser # @return [EBNFParser] def initialize(input, **options, &block) # If the `level` option is set, instantiate a logger for collecting trace information. - if options.has_key?(:level) - options[:logger] = Logger.new(STDERR) - options[:logger].level = options[:level] - options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"} + if options.key?(:level) + options[:logger] ||= Logger.new(STDERR). + tap {|x| x.level = options[:level]}. + tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}} end # Read input, if necessary, which will be used in a Scanner. 
diff --git a/examples/ebnf-ll1-parser/parser.rb b/examples/ebnf-ll1-parser/parser.rb index 3b1736c..5dc09b0 100644 --- a/examples/ebnf-ll1-parser/parser.rb +++ b/examples/ebnf-ll1-parser/parser.rb @@ -276,10 +276,10 @@ def initialize(input, **options, &block) @input = input.respond_to?(:read) ? input.read : input.to_s # If the `level` option is set, instantiate a logger for collecting trace information. - if options.has_key?(:level) - options[:logger] = Logger.new(STDERR) - options[:logger].level = options.fetch(:level, 2) - options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"} + if options.key?(:level) + options[:logger] ||= Logger.new(STDERR). + tap {|x| x.level = options[:level]}. + tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}} end parsing_terminals = false diff --git a/examples/ebnf-peg-parser/parser.rb b/examples/ebnf-peg-parser/parser.rb index f8127a8..d56dc54 100644 --- a/examples/ebnf-peg-parser/parser.rb +++ b/examples/ebnf-peg-parser/parser.rb @@ -272,10 +272,10 @@ class EBNFPegParser # @return [EBNFParser] def initialize(input, **options, &block) # If the `level` option is set, instantiate a logger for collecting trace information. - if options.has_key?(:level) - options[:logger] = Logger.new(STDERR) - options[:logger].level = options[:level] - options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"} + if options.key?(:level) + options[:logger] ||= Logger.new(STDERR). + tap {|x| x.level = options[:level]}. + tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}} end # Read input, if necessary, which will be used in a Scanner. 
diff --git a/examples/isoebnf/parser.rb b/examples/isoebnf/parser.rb index 4e12086..65d08b2 100644 --- a/examples/isoebnf/parser.rb +++ b/examples/isoebnf/parser.rb @@ -200,10 +200,10 @@ class ISOEBNFPegParser # @return [EBNFParser] def initialize(input, **options, &block) # If the `level` option is set, instantiate a logger for collecting trace information. - if options.has_key?(:level) - options[:logger] = Logger.new(STDERR) - options[:logger].level = options[:level] - options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"} + if options.key?(:level) + options[:logger] ||= Logger.new(STDERR). + tap {|x| x.level = options[:level]}. + tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}} end # Read input, if necessary, which will be used in a Scanner. diff --git a/lib/ebnf/abnf.rb b/lib/ebnf/abnf.rb index de79815..ba7fd12 100644 --- a/lib/ebnf/abnf.rb +++ b/lib/ebnf/abnf.rb @@ -234,10 +234,10 @@ class ABNF # @return [EBNFParser] def initialize(input, **options) # If the `level` option is set, instantiate a logger for collecting trace information. - if options.has_key?(:level) - options[:logger] = Logger.new(STDERR) - options[:logger].level = options[:level] - options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"} + if options.key?(:level) + options[:logger] ||= Logger.new(STDERR). + tap {|x| x.level = options[:level]}. + tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}} end # Read input, if necessary, which will be used in a Scanner. diff --git a/lib/ebnf/base.rb b/lib/ebnf/base.rb index 25fc53b..a5de1f6 100644 --- a/lib/ebnf/base.rb +++ b/lib/ebnf/base.rb @@ -106,8 +106,8 @@ class Base # Format of input, one of `:abnf`, `:ebnf`, `:isoebnf`, `:isoebnf`, `:native`, or `:sxp`. # Use `:native` for the native EBNF parser, rather than the PEG parser. 
# @param [Hash{Symbol => Object}] options - # @option options [Boolean, Array] :debug - # Output debug information to an array or $stdout. + # @option options [Integer] :level + # Trace level. 0(debug), 1(info), 2(warn), 3(error). # @option options [Boolean, Array] :validate # Validate resulting grammar. def initialize(input, format: :ebnf, **options) @@ -311,13 +311,7 @@ def depth # Progress output, less than debugging def progress(*args, **options) - return unless @options[:progress] || @options[:debug] - depth = options[:depth] || @depth - args << yield if block_given? - message = "#{args.join(': ')}" - str = "[#{@lineno}]#{' ' * depth}#{message}" - @options[:debug] << str if @options[:debug].is_a?(Array) - $stderr.puts(str) if @options[:progress] || @options[:debug] == true + debug(*args, level: Logger::INFO, **options) end # Error output @@ -325,10 +319,9 @@ def error(*args, **options) depth = options[:depth] || @depth args << yield if block_given? message = "#{args.join(': ')}" + debug(message, level: Logger::ERROR, **options) @errors << message - str = "[#{@lineno}]#{' ' * depth}#{message}" - @options[:debug] << str if @options[:debug].is_a?(Array) - $stderr.puts(str) + $stderr.puts(message) end ## @@ -342,14 +335,17 @@ def error(*args, **options) # @param [String] message ("") # # @yieldreturn [String] added to message - def debug(*args, **options) - return unless @options[:debug] + def debug(*args, level: Logger::DEBUG, **options) + return unless @options.key?(:logger) depth = options[:depth] || @depth args << yield if block_given? 
message = "#{args.join(': ')}" str = "[#{@lineno}]#{' ' * depth}#{message}" - @options[:debug] << str if @options[:debug].is_a?(Array) - $stderr.puts(str) if @options[:debug] == true + if @options[:logger].respond_to?(:add) + @options[:logger].add(level, str) + elsif @options[:logger].respond_to?(:<<) + @options[:logger] << "[#{lineno}] " + str + end end end end \ No newline at end of file diff --git a/lib/ebnf/ebnf/meta.rb b/lib/ebnf/ebnf/meta.rb index 7492826..c319b09 100644 --- a/lib/ebnf/ebnf/meta.rb +++ b/lib/ebnf/ebnf/meta.rb @@ -1,4 +1,4 @@ -# This file is automatically generated by ebnf version 2.0.0 +# This file is automatically generated by ebnf version 2.5.0 # Derived from etc/ebnf.ebnf module EBNFMeta RULES = [ @@ -25,11 +25,13 @@ module EBNFMeta EBNF::Rule.new(:_LHS_3, "11.3", [:seq, "[", :SYMBOL, "]", :_LHS_4], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_LHS_4, "11.4", [:plus, " "], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_LHS_2, "11.2", [:star, " "], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:SYMBOL, "12", [:plus, :_SYMBOL_1], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_SYMBOL_1, "12.1", [:alt, :_SYMBOL_2, :_SYMBOL_3, :_SYMBOL_4, "_", "."], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_SYMBOL_2, "12.2", [:range, "a-z"], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_SYMBOL_3, "12.3", [:range, "A-Z"], kind: :terminal).extend(EBNF::PEG::Rule), - EBNF::Rule.new(:_SYMBOL_4, "12.4", [:range, "0-9"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:SYMBOL, "12", [:alt, :_SYMBOL_1, :O_SYMBOL], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_SYMBOL_1, "12.1", [:seq, "<", :O_SYMBOL, ">"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:O_SYMBOL, "12a", [:plus, :_O_SYMBOL_1], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_O_SYMBOL_1, "12a.1", [:alt, :_O_SYMBOL_2, :_O_SYMBOL_3, :_O_SYMBOL_4, "_", "."], kind: 
:terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_O_SYMBOL_2, "12a.2", [:range, "a-z"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_O_SYMBOL_3, "12a.3", [:range, "A-Z"], kind: :terminal).extend(EBNF::PEG::Rule), + EBNF::Rule.new(:_O_SYMBOL_4, "12a.4", [:range, "0-9"], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:HEX, "13", [:seq, "#x", :_HEX_1], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_1, "13.1", [:plus, :_HEX_2], kind: :terminal).extend(EBNF::PEG::Rule), EBNF::Rule.new(:_HEX_2, "13.2", [:alt, :_HEX_3, :_HEX_4, :_HEX_5], kind: :terminal).extend(EBNF::PEG::Rule), diff --git a/lib/ebnf/isoebnf.rb b/lib/ebnf/isoebnf.rb index ba585da..80a60a3 100644 --- a/lib/ebnf/isoebnf.rb +++ b/lib/ebnf/isoebnf.rb @@ -196,10 +196,10 @@ class ISOEBNF # @return [EBNFParser] def initialize(input, **options, &block) # If the `level` option is set, instantiate a logger for collecting trace information. - if options.has_key?(:level) - options[:logger] = Logger.new(STDERR) - options[:logger].level = options[:level] - options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"} + if options.key?(:level) + options[:logger] ||= Logger.new(STDERR). + tap {|x| x.level = options[:level]}. + tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}} end # Read input, if necessary, which will be used in a Scanner. 
diff --git a/lib/ebnf/ll1/parser.rb b/lib/ebnf/ll1/parser.rb index e8f5159..af0104e 100644 --- a/lib/ebnf/ll1/parser.rb +++ b/lib/ebnf/ll1/parser.rb @@ -603,7 +603,7 @@ def onStart(prod) if handler # Create a new production data element, potentially allowing handler # to customize before pushing on the @prod_data stack - debug("#{prod}(:start):#{@prod_data.length}") {@prod_data.last} + progress("#{prod}(:start):#{@prod_data.length}") {@prod_data.last} data = {} begin self.class.eval_with_binding(self) { @@ -617,12 +617,12 @@ def onStart(prod) elsif [:merge, :star].include?(@cleanup[prod]) # Save current data to merge later @prod_data << {} - debug("#{prod}(:start}:#{@prod_data.length}:cleanup:#{@cleanup[prod]}") { get_token.inspect + (@recovering ? ' recovering' : '')} + progress("#{prod}(:start}:#{@prod_data.length}:cleanup:#{@cleanup[prod]}") { get_token.inspect + (@recovering ? ' recovering' : '')} else # Make sure we push as many was we pop, even if there is no # explicit start handler @prod_data << {} if self.class.production_handlers[prod] - debug("#{prod}(:start:#{@prod_data.length})") { get_token.inspect + (@recovering ? ' recovering' : '')} + progress("#{prod}(:start:#{@prod_data.length})") { get_token.inspect + (@recovering ? ' recovering' : '')} end #puts "prod_data(s): " + @prod_data.inspect end diff --git a/lib/ebnf/native.rb b/lib/ebnf/native.rb index 512cef2..f0b97e2 100644 --- a/lib/ebnf/native.rb +++ b/lib/ebnf/native.rb @@ -52,7 +52,7 @@ def eachRule(scanner) yield r unless r.empty? #debug("eachRule(rule)") { "[#{cur_lineno}] #{s.inspect}" } @lineno = cur_lineno - r = s + r = s.gsub(/[<>]/, '') # Remove angle brackets else # Collect until end of line, or start of comment or quote s = scanner.scan_until(%r{(?:[/\(]\*)|#(?!x)|//|["']|$}) @@ -81,6 +81,7 @@ def ruleParts(rule) num, sym = num_sym.split(']', 2).map(&:strip) num, sym = "", num if sym.nil? 
num = num[1..-1] + sym = sym[1..-2] if sym.start_with?('<') && sym.end_with?('>') r = Rule.new(sym && sym.to_sym, num, expression(expr).first, ebnf: self) debug("ruleParts") { r.inspect } r @@ -226,7 +227,7 @@ def diff(s) # (a ' b c') # # >>> postfix("a? b c") - # ((opt, a) ' b c') + # ((opt a) ' b c') def postfix(s) debug("postfix") {"(#{s.inspect})"} e, s = depth {primary(s)} @@ -297,8 +298,8 @@ def terminal(s) s.match(/(#x\h+)(.*)$/) l, s = $1, $2 [[:hex, l], s] - when /[\w\.]/ # SYMBOL - s.match(/([\w\.]+)(.*)$/) + when '<', /[\w\.]/ # SYMBOL + s.match(/?(.*)$/) l, s = $1, $2 [l.to_sym, s] when '-' diff --git a/lib/ebnf/parser.rb b/lib/ebnf/parser.rb index 54bca9f..0dc4d68 100644 --- a/lib/ebnf/parser.rb +++ b/lib/ebnf/parser.rb @@ -11,6 +11,12 @@ class Parser # @return [Array] attr_reader :ast + # Set on first rule + attr_reader :lhs_includes_identifier + + # Regular expression to match a [...] range, which may be distinguisehd from an LHS + attr_reader :range + # ## Terminals # Define rules for Terminals, placing results on the input stack, making them available to upstream non-Terminal rules. # @@ -26,15 +32,32 @@ class Parser # Match the Left hand side of a rule or terminal # - # [11] LHS ::= ('[' SYMBOL+ ']' ' '+)? SYMBOL ' '* '::=' + # [11] LHS ::= ('[' SYMBOL+ ']' ' '+)? ? ' '* '::=' terminal(:LHS, LHS) do |value, prod| - value.to_s.scan(/(?:\[([^\]]+)\])?\s*(\w+)\s*::=/).first + md = value.to_s.scan(/(?:\[([^\]]+)\])?\s*?\s*::=/).first + if @lhs_includes_identifier.nil? + @lhs_includes_identifier = !md[0].nil? + @range = md[0] ? 
RANGE_NOT_LHS : RANGE + elsif @lhs_includes_identifier && !md[0] + error("LHS", + "Rule does not begin with a [xxx] identifier, which was established on the first rule", + production: :LHS, + rest: value) + elsif !@lhs_includes_identifier && md[0] + error("LHS", + "Rule begins with a [xxx] identifier, which was not established on the first rule", + production: :LHS, + rest: value) + end + md end # Match `SYMBOL` terminal # - # [12] SYMBOL ::= ([a-z] | [A-Z] | [0-9] | '_' | '.')+ + # [12] SYMBOL ::= '<' O_SYMBOL '>' | O_SYMBOL + # [12a] O_SYMBOL ::= ([a-z] | [A-Z] | [0-9] | '_' | '.')+ terminal(:SYMBOL, SYMBOL) do |value| + value = value[1..-2] if value.start_with?('<') && value.end_with?('>') value.to_sym end @@ -46,9 +69,10 @@ class Parser end # Terminal for `RANGE` is matched as part of a `primary` rule. + # Note that this won't match if rules include identifiers. # - # [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' - LHS - terminal(:RANGE, RANGE) do |value| + # [14] RANGE ::= '[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']' + terminal(:RANGE, proc {@range}) do |value| [:range, value[1..-2]] end @@ -128,7 +152,9 @@ class Parser # Invoke callback id, sym = value[:LHS] expression = value[:expression] - callback.call(:rule, EBNF::Rule.new(sym.to_sym, id, expression)) + rule = EBNF::Rule.new(sym.to_sym, id, expression) + progress(:rule, rule.to_sxp) + callback.call(:rule, rule) nil end @@ -266,12 +292,15 @@ class Parser # @return [EBNFParser] def initialize(input, **options, &block) # If the `level` option is set, instantiate a logger for collecting trace information. - if options.has_key?(:level) - options[:logger] = Logger.new(STDERR) - options[:logger].level = options[:level] - options[:logger].formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"} + if options.key?(:level) + options[:logger] ||= Logger.new(STDERR). + tap {|x| x.level = options[:level]}. 
+ tap {|x| x.formatter = lambda {|severity, datetime, progname, msg| "#{severity} #{msg}\n"}} end + # This is established on the first rule. + self.class.instance_variable_set(:@lhs_includes_identifier, nil) + # Read input, if necessary, which will be used in a Scanner. @input = input.respond_to?(:read) ? input.read : input.to_s diff --git a/lib/ebnf/peg/parser.rb b/lib/ebnf/peg/parser.rb index 79a7270..9fe39ed 100644 --- a/lib/ebnf/peg/parser.rb +++ b/lib/ebnf/peg/parser.rb @@ -68,10 +68,9 @@ def terminal_options; (@terminal_options ||= {}); end # # @param [Symbol] term # The terminal name. - # @param [Regexp] regexp (nil) - # Pattern used to scan for this terminal, - # defaults to the expression defined in the associated rule. - # If unset, the terminal rule is used for matching. + # @param [Regexp, Proc] regexp + # Pattern used to scan for this terminal. + # Passing a Proc will evaluate that proc to retrieve a regular expression. # @param [Hash] options # @option options [Boolean] :unescape # Cause strings and codepoints to be unescaped. @@ -83,8 +82,8 @@ def terminal_options; (@terminal_options ||= {}); end # @yieldparam [Proc] block # Block passed to initialization for yielding to calling parser. # Should conform to the yield specs for #initialize - def terminal(term, regexp = nil, **options, &block) - terminal_regexps[term] = regexp if regexp + def terminal(term, regexp, **options, &block) + terminal_regexps[term] = regexp terminal_handlers[term] = block if block_given? terminal_options[term] = options.freeze end @@ -138,6 +137,8 @@ def start_production(term, **options, &block) # @yieldparam [Proc] block # Block passed to initialization for yielding to calling parser. # Should conform to the yield specs for #initialize + # @yieldparam [Hash] **options + # Other data that may be passed to the production # @yieldreturn [Object] the result of this production. 
# Yield to generate a triple def production(term, clear_packrat: false, &block) @@ -183,6 +184,8 @@ def method_missing(method, *args, &block) # Identify the symbol of the starting rule with `start`. # @param [Hash{Symbol => Object}] options # @option options[Integer] :high_water passed to lexer + # @option options[:upper, :lower] :insensitive_strings + # Perform case-insensitive match of strings not defined as terminals, and map to either upper or lower case. # @option options [Logger] :logger for errors/progress/debug. # @option options[Integer] :low_water passed to lexer # @option options[Boolean] :seq_hash (false) @@ -201,7 +204,7 @@ def method_missing(method, *args, &block) # or errors raised during processing callbacks. Internal # errors are raised using {Error}. # @todo FIXME implement seq_hash - def parse(input = nil, start = nil, rules = nil, **options, &block) + def parse(input = nil, start = nil, rules = nil, insensitive_strings: nil, **options, &block) start ||= options[:start] rules ||= options[:rules] || [] @rules = rules.inject({}) {|memo, rule| memo.merge(rule.sym => rule)} @@ -230,7 +233,7 @@ def parse(input = nil, start = nil, rules = nil, **options, &block) start_rule = @rules[start] raise Error, "Starting production #{start.inspect} not defined" unless start_rule - result = start_rule.parse(scanner) + result = start_rule.parse(scanner, insensitive_strings: insensitive_strings) if result == :unmatched # Start rule wasn't matched, which is about the only error condition error("--top--", @furthest_failure.to_s, @@ -367,21 +370,17 @@ def debug(*args, &block) # Start for production # Adds data avoiable during the processing of the production # + # @param [Symbol] prod + # @param [Hash] **options other options available for handlers # @return [Hash] composed of production options. Currently only `as_hash` is supported. 
# @see ClassMethods#start_production - def onStart(prod) + def onStart(prod, **options) handler = self.class.start_handlers[prod] @productions << prod - debug("#{prod}(:start)", "", - lineno: (scanner.lineno if scanner), - pos: (scanner.pos if scanner) - ) do - "#{prod}, pos: #{scanner ? scanner.pos : '?'}, rest: #{scanner ? scanner.rest[0..20].inspect : '?'}" - end if handler # Create a new production data element, potentially allowing handler # to customize before pushing on the @prod_data stack - data = {_production: prod} + data = {_production: prod}.merge(options) begin self.class.eval_with_binding(self) { handler.call(data, @parse_callback) @@ -396,14 +395,21 @@ def onStart(prod) # explicit start handler @prod_data << {_production: prod} end + progress("#{prod}(:start)", "", + lineno: (scanner.lineno if scanner), + pos: (scanner.pos if scanner) + ) do + "#{data.inspect}@(#{scanner ? scanner.pos : '?'}), rest: #{scanner ? scanner.rest[0..20].inspect : '?'}" + end return self.class.start_options.fetch(prod, {}) # any options on this production end # Finish of production # # @param [Object] result parse result + # @param [Hash] **options other options available for handlers # @return [Object] parse result, or the value returned from the handler - def onFinish(result) + def onFinish(result, **options) #puts "prod_data(f): " + @prod_data.inspect prod = @productions.last handler, clear_packrat = self.class.production_handlers[prod] @@ -415,14 +421,14 @@ def onFinish(result) # Pop production data element from stack, potentially allowing handler to use it result = begin self.class.eval_with_binding(self) { - handler.call(result, data, @parse_callback) + handler.call(result, data, @parse_callback, **options) } rescue ArgumentError, Error => e error("finish", "#{e.class}: #{e.message}", production: prod, backtrace: e.backtrace) @recovering = false end end - debug("#{prod}(:finish)", "", + progress("#{prod}(:finish)", "", lineno: (scanner.lineno if scanner), level: 
result == :unmatched ? 0 : 1) do "#{result.inspect}@(#{scanner ? scanner.pos : '?'}), rest: #{scanner ? scanner.rest[0..20].inspect : '?'}" @@ -572,5 +578,5 @@ def initialize(message, **options) super(message.to_s) end end # class Error - end # class Parser -end # module EBNF::LL1 + end # module Parser +end # module EBNF::PEG diff --git a/lib/ebnf/peg/rule.rb b/lib/ebnf/peg/rule.rb index 305543a..4073de9 100644 --- a/lib/ebnf/peg/rule.rb +++ b/lib/ebnf/peg/rule.rb @@ -13,7 +13,7 @@ module Rule ## # Parse a rule or terminal, invoking callbacks, as appropriate - # If there is are `start_production` and/or `production`, + # If there are `start_production` and/or `production` handlers, # they are invoked with a `prod_data` stack, the input stream and offset. # Otherwise, the results are added as an array value # to a hash indexed by the rule name. @@ -31,8 +31,9 @@ module Rule # * `star`: returns an array of the values matched for the specified production. For Terminals, these are concatenated into a single string. # # @param [Scanner] input + # @param [Hash] **options Other data that may be passed to handlers. # @return [Hash{Symbol => Object}, :unmatched] A hash with keys for matched component of the expression. Returns :unmatched if the input does not match the production. - def parse(input) + def parse(input, **options) # Save position and linenumber for backtracking pos, lineno = input.pos, input.lineno @@ -48,6 +49,7 @@ def parse(input) # use that to match the input, # otherwise, if regexp = parser.terminal_regexp(sym) + regexp = regexp.call() if regexp.is_a?(Proc) term_opts = parser.terminal_options(sym) if matched = input.scan(regexp) # Optionally map matched @@ -71,12 +73,12 @@ def parse(input) else eat_whitespace(input) end - start_options = parser.onStart(sym) + start_options = options.merge(parser.onStart(sym, **options)) string_regexp_opts = start_options[:insensitive_strings] ? 
Regexp::IGNORECASE : 0 result = case expr.first when :alt - # Return the first expression to match. + # Return the first expression to match. Look at strings before terminals before non-terminals, with strings ordered by longest first # Result is either :unmatched, or the value of the matching rule alt = :unmatched expr[1..-1].each do |prod| @@ -84,14 +86,19 @@ def parse(input) when Symbol rule = parser.find_rule(prod) raise "No rule found for #{prod}" unless rule - rule.parse(input) + rule.parse(input, **options) when String - s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) - case start_options[:insensitive_strings] - when :lower then s && s.downcase - when :upper then s && s.upcase - else s - end || :unmatched + # If the input matches a terminal for which the string is a prefix, don't match the string + if terminal_also_matches(input, prod, string_regexp_opts) + :unmatched + else + s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) + case start_options[:insensitive_strings] + when :lower then s && s.downcase + when :upper then s && s.upcase + else s + end || :unmatched + end end if alt == :unmatched # Update furthest failure for strings and terminals @@ -127,9 +134,18 @@ def parse(input) when Symbol rule = parser.find_rule(prod) raise "No rule found for #{prod}" unless rule - rule.parse(input) + rule.parse(input, **options) when String - input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) || :unmatched + if terminal_also_matches(input, prod, string_regexp_opts) + :unmatched + else + s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) + case start_options[:insensitive_strings] + when :lower then s && s.downcase + when :upper then s && s.upcase + else s + end || :unmatched + end end if res != :unmatched # Update furthest failure for terminals @@ -148,7 +164,7 @@ def parse(input) when :plus # Result is an array of all expressions while they match, # at least one must match - plus = rept(input, 1, '*', 
expr[1], string_regexp_opts) + plus = rept(input, 1, '*', expr[1], string_regexp_opts, **options) # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal? @@ -163,7 +179,7 @@ def parse(input) when :rept # Result is an array of all expressions while they match, # an empty array of none match - rept = rept(input, expr[1], expr[2], expr[3], string_regexp_opts) + rept = rept(input, expr[1], expr[2], expr[3], string_regexp_opts, **options) # # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, expr[3]) if terminal? @@ -176,14 +192,18 @@ def parse(input) when Symbol rule = parser.find_rule(prod) raise "No rule found for #{prod}" unless rule - rule.parse(input) + rule.parse(input, **options.merge(_rept_data: accumulator)) when String - s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) - case start_options[:insensitive_strings] - when :lower then s && s.downcase - when :upper then s && s.upcase - else s - end || :unmatched + if terminal_also_matches(input, prod, string_regexp_opts) + :unmatched + else + s = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts)) + case start_options[:insensitive_strings] + when :lower then s && s.downcase + when :upper then s && s.upcase + else s + end || :unmatched + end end if res == :unmatched # Update furthest failure for strings and terminals @@ -204,7 +224,7 @@ def parse(input) when :star # Result is an array of all expressions while they match, # an empty array of none match - star = rept(input, 0, '*', expr[1], string_regexp_opts) + star = rept(input, 0, '*', expr[1], string_regexp_opts, **options) # Update furthest failure for strings and terminals parser.update_furthest_failure(input.pos, input.lineno, expr[1]) if terminal? @@ -214,10 +234,11 @@ def parse(input) end if result == :unmatched + # Rewind input to entry point if unmatched. 
input.pos, input.lineno = pos, lineno end - result = parser.onFinish(result) + result = parser.onFinish(result, **options) (parser.packrat[sym] ||= {})[pos] = { pos: input.pos, lineno: input.lineno, @@ -229,7 +250,8 @@ def parse(input) ## # Repitition, 0-1, 0-n, 1-n, ... # - # Note, nil results are removed from the result, but count towards min/max calculations + # Note, nil results are removed from the result, but count towards min/max calculations. + # Saves temporary production data to prod_data stack. # # @param [Scanner] input # @param [Integer] min @@ -245,11 +267,12 @@ def rept(input, min, max, prod, string_regexp_opts, **options) when Symbol rule = parser.find_rule(prod) raise "No rule found for #{prod}" unless rule - while (max == '*' || result.length < max) && (res = rule.parse(input)) != :unmatched + while (max == '*' || result.length < max) && (res = rule.parse(input, **options.merge(_rept_data: result))) != :unmatched eat_whitespace(input) unless terminal? result << res end when String + # FIXME: don't match a string, if input matches a terminal while (res = input.scan(Regexp.new(Regexp.quote(prod), string_regexp_opts))) && (max == '*' || result.length < max) eat_whitespace(input) unless terminal? result << case options[:insensitive_strings] @@ -263,6 +286,16 @@ def rept(input, min, max, prod, string_regexp_opts, **options) result.length < min ? :unmatched : result.compact end + ## + # See if a terminal could have a longer match than a string + def terminal_also_matches(input, prod, string_regexp_opts) + str_regex = Regexp.new(Regexp.quote(prod), string_regexp_opts) + input.match?(str_regex) && parser.class.terminal_regexps.any? 
do |sym, re| + re = re.call() if re.is_a?(Proc) + (match_len = input.match?(re)) && match_len > prod.length + end + end + ## # Eat whitespace between non-terminal rules def eat_whitespace(input) diff --git a/lib/ebnf/terminals.rb b/lib/ebnf/terminals.rb index 24e498b..861aa02 100644 --- a/lib/ebnf/terminals.rb +++ b/lib/ebnf/terminals.rb @@ -1,13 +1,14 @@ # encoding: utf-8 # Terminal definitions for the EBNF grammar module EBNF::Terminals - SYMBOL_BASE = %r(\b[a-zA-Z0-9_\.]+\b)u.freeze - SYMBOL = %r(#{SYMBOL_BASE}(?!\s*::=))u.freeze + SYMBOL_BASE = %r(\b[a-zA-Z0-9_\.]+\b)u.freeze # Word boundaries + SYMBOL = %r((?:#{SYMBOL_BASE}|(?:<#{SYMBOL_BASE}>))(?!\s*::=))u.freeze HEX = %r(\#x\h+)u.freeze CHAR = %r([\u0009\u000A\u000D\u0020-\uD7FF\u{10000}-\u{10FFFF}])u.freeze R_CHAR = %r([\u0009\u000A\u000D\u0020-\u002C\u002E-\u005C\u005E-\uD7FF\u{10000}-\u{10FFFF}])u.freeze - RANGE = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\](?!\s+#{SYMBOL_BASE}\s*::=))u.freeze - LHS = %r((?:\[#{SYMBOL_BASE}\])?\s*#{SYMBOL_BASE}\s*::=)u.freeze + LHS = %r((?:\[#{SYMBOL_BASE}\])?\s*?\s*::=)u.freeze + RANGE = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\])u.freeze + RANGE_NOT_LHS = %r(\[(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX})|#{R_CHAR}|#{HEX})+-?\](?!\s*?\s*::=))u.freeze O_RANGE = %r(\[\^(?:(?:#{R_CHAR}\-#{R_CHAR})|(?:#{HEX}\-#{HEX}|#{R_CHAR}|#{HEX}))+-?\])u.freeze STRING1 = %r("[\u0009\u000A\u000D\u0020\u0021\u0023-\uD7FF\u{10000}-\u{10FFFF}]*")u.freeze STRING2 = %r('[\u0009\u000A\u000D\u0020-\u0026\u0028-\uD7FF\u{10000}-\u{10FFFF}]*')u.freeze diff --git a/spec/base_spec.rb b/spec/base_spec.rb index 37af3fb..001d81d 100644 --- a/spec/base_spec.rb +++ b/spec/base_spec.rb @@ -14,9 +14,9 @@ %{((rule Prolog "2" (seq (opt BaseDecl) (star PrefixDecl))))}, %{ @terminals - [3] terminal ::= [A-Z]+ + terminal ::= [A-Z]+ } => %{((terminals _terminals (seq)) - (terminal terminal "3" (plus (range "A-Z"))))}, + (terminal terminal (plus 
(range "A-Z"))))}, %{ [9] primary ::= HEX | RANGE @@ -125,11 +125,11 @@ end describe "#to_s" do - specify {expect(subject.to_s).to include("[1] ebnf")} + specify {expect(subject.to_s).to include("ebnf")} end describe "#to_html" do - specify {expect(subject.to_s).to include("[1] ebnf")} + specify {expect(subject.to_s).to include("ebnf")} end describe "#to_ruby" do diff --git a/spec/ebnf_spec.rb b/spec/ebnf_spec.rb index 20b93b2..cb41be5 100644 --- a/spec/ebnf_spec.rb +++ b/spec/ebnf_spec.rb @@ -30,6 +30,14 @@ | STRING2 | '(' expression ')' } => %{((rule primary (alt HEX RANGE O_RANGE STRING1 STRING2 (seq '(' expression ')'))))}, + %{ + ::= + | + | + | + | + | '(' ')' + } => %{((rule primary (alt HEX RANGE O_RANGE STRING1 STRING2 (seq '(' expression ')'))))}, }.each do |input, expected| context input do subject {EBNF.parse(input)} diff --git a/spec/ll1_spec.rb b/spec/ll1_spec.rb index 1abdec7..cebc6d6 100644 --- a/spec/ll1_spec.rb +++ b/spec/ll1_spec.rb @@ -5,6 +5,11 @@ require 'sxp' describe EBNF::Base do + let(:logger) {RDF::Spec.logger} + after(:each) do |example| + puts logger.to_s if example.exception && !example.exception.is_a?(RSpec::Expectations::ExpectationNotMetError) + end + describe "#first_follow" do context "start" do context "with legitimate start rule" do @@ -421,11 +426,8 @@ end def parse(value, **options) - @debug = [] - options = {debug: @debug}.merge(options) ebnf = EBNF::Base.new(value, **options) ebnf.make_bnf - @debug.clear ebnf.first_follow(options[:start]) ebnf end diff --git a/spec/native_spec.rb b/spec/native_spec.rb index 7fd612c..eb9750a 100644 --- a/spec/native_spec.rb +++ b/spec/native_spec.rb @@ -60,6 +60,10 @@ %{Prolog ::= BaseDecl? PrefixDecl*}, %{((rule Prolog (seq (opt BaseDecl) (star PrefixDecl))))} ], + "prolog (with brackets)": [ + %{ ::= ? 
*}, + %{((rule Prolog (seq (opt BaseDecl) (star PrefixDecl))))} + ], "aliteration": [ %{declaration ::= '@terminals' | '@pass'}, %{((rule declaration (alt '@terminals' '@pass')))}, @@ -111,7 +115,7 @@ %{NCCHAR1 | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]} => %{(alt NCCHAR1 '-' (range "0-9") (hex "#x00B7") (range "#x0300-#x036F") (range "#x203F-#x2040"))}, %{'<' ([^<>"{}|^`\]-[#x00-#x20] | UCHAR)* '>'} => - %{(seq '<' (star (alt (diff (range "^<>\\\"{}|^`") (range "#x00-#x20")) UCHAR)) '>')} + %{(seq '<' (star (alt (diff (range "^<>\\\"{}|^`") (range "#x00-#x20")) UCHAR)) '>')}, }.each do |input, expected| it "given #{input.inspect} produces #{expected}" do rule = parse("rule ::= #{input}").ast.first diff --git a/spec/parser_spec.rb b/spec/parser_spec.rb index 3213000..db68f14 100644 --- a/spec/parser_spec.rb +++ b/spec/parser_spec.rb @@ -43,9 +43,9 @@ '>')))}, ], "minimal whitespace": [ - %{[xx]minimal::=whitespace[yy]whitespace::=PASS}, - %{((rule minimal "xx" (seq whitespace (range "yy"))) - (rule whitespace (seq PASS)))} + %{[xx]minimal::=whitespace[yy]whitespace::=" "}, + %{((rule minimal "xx" (seq whitespace)) + (rule whitespace "yy" (seq " ")))} ] }.each do |title, (input, expect)| it title do diff --git a/spec/peg/data/parser.rb b/spec/peg/data/parser.rb index 2fe4807..e462615 100644 --- a/spec/peg/data/parser.rb +++ b/spec/peg/data/parser.rb @@ -26,7 +26,7 @@ class EBNFPegParser terminal(:HEX, HEX) - terminal(:RANGE, RANGE, unescape: true) do |value| + terminal(:RANGE, RANGE_NOT_LHS, unescape: true) do |value| [:range, value[1..-2]] end diff --git a/spec/peg/rule_spec.rb b/spec/peg/rule_spec.rb index edd86d7..3c4a4f2 100644 --- a/spec/peg/rule_spec.rb +++ b/spec/peg/rule_spec.rb @@ -5,7 +5,10 @@ describe EBNF::PEG::Rule do describe "#parse" do - let(:parser) {double("PEG Parser", whitespace: /\s+/, packrat: {}, update_furthest_failure: true)} + let(:parser_class) {double("PEG Parser Class", terminal_regexps: {})} + let(:parser) { + 
double("PEG Parser", whitespace: /\s+/, packrat: {}, update_furthest_failure: true, class: parser_class) + } context "non-terminal rules" do { diff --git a/spec/peg_spec.rb b/spec/peg_spec.rb index c8ad6e2..2a80638 100644 --- a/spec/peg_spec.rb +++ b/spec/peg_spec.rb @@ -20,8 +20,28 @@ | '(' expression ')' } => - %{((rule primary "9" (alt HEX RANGE O_RANGE STRING1 STRING2 _primary_1)) - (rule _primary_1 "9.1" (seq '(' expression ')')))}, + %{((rule primary "9" (alt HEX RANGE O_RANGE STRING1 STRING2 _primary_1)) + (rule _primary_1 "9.1" (seq '(' expression ')')))}, + %{ + primary ::= HEX + | RANGE + | O_RANGE + | STRING1 + | STRING2 + | '(' expression ')' + } => + %{((rule primary (alt HEX RANGE O_RANGE STRING1 STRING2 _primary_1)) + (rule _primary_1 (seq '(' expression ')')))}, + %{ + ::= + | + | + | + | + | '(' ')' + } => + %{((rule primary (alt HEX RANGE O_RANGE STRING1 STRING2 _primary_1)) + (rule _primary_1 (seq '(' expression ')')))}, %{[1] start ::= A B C} => %{((rule start "1" (seq A B C)))}, %{[1] start ::= A B? C* D+} => diff --git a/spec/rule_spec.rb b/spec/rule_spec.rb index eba5e97..7e6a269 100644 --- a/spec/rule_spec.rb +++ b/spec/rule_spec.rb @@ -877,7 +877,8 @@ primary: [:HEX, :SYMBOL, :O_RANGE, :RANGE, :STRING1, :STRING2, "("], pass: ["@pass"], LHS: ["["], - SYMBOL: ["a-z", "A-Z", "0-9", "_", "."], + SYMBOL: ["<", :O_SYMBOL], + O_SYMBOL: ["a-z", "A-Z", "0-9", "_", "."], HEX: ["#x"], RANGE: ["["], O_RANGE: ["[^"], @@ -908,9 +909,10 @@ primary: [:HEX, :SYMBOL, :O_RANGE, :RANGE, :STRING1, :STRING2, :expression], pass: [:expression], LHS: [:SYMBOL], - SYMBOL: [], + SYMBOL: [:O_SYMBOL], + O_SYMBOL: [], HEX: [], - RANGE: [:R_CHAR, :HEX, :LHS], + RANGE: [:R_CHAR, :HEX], O_RANGE: [:R_CHAR, :HEX], STRING1: [:CHAR], STRING2: [:CHAR],
[12] SYMBOL ::=('<' O_SYMBOL '>') | O_SYMBOL
[12a]O_SYMBOL::= ([a-z] | [A-Z] | [0-9] | '_' | '.')+
[14] RANGE ::='[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? (']' - LHS)'[' ((R_CHAR '-' R_CHAR) | (HEX '-' HEX) | R_CHAR | HEX)+ '-'? ']'
[15]