From 6bff819630fd536ea4c071c27ca8f8c6b30dad0b Mon Sep 17 00:00:00 2001 From: Gregg Kellogg Date: Wed, 25 Aug 2021 16:28:45 -0700 Subject: [PATCH] Add back class method aliases for LL1::Lexer,unescape_codepoints and unescape_string. --- lib/ebnf/ll1/lexer.rb | 24 +++++++++++++++ spec/ll1/lexer_spec.rb | 69 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/lib/ebnf/ll1/lexer.rb b/lib/ebnf/ll1/lexer.rb index 65520d4..3aa8a73 100644 --- a/lib/ebnf/ll1/lexer.rb +++ b/lib/ebnf/ll1/lexer.rb @@ -38,6 +38,30 @@ class Lexer # @return [Regexp] defines whitespace, including comments, otherwise whitespace must be explicit in terminals attr_reader :whitespace + ## + # Returns a copy of the given `string` with all `\uXXXX` and + # `\UXXXXXXXX` Unicode codepoint escape sequences replaced with their + # unescaped UTF-8 character counterparts. + # + # @param [String] string + # @return [String] the value returned by `::EBNF::Unescape.unescape_codepoints` + # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape + def self.unescape_codepoints(string) + ::EBNF::Unescape.unescape_codepoints(string) + end + + ## + # Returns a copy of the given `input` string with all string escape + # sequences (e.g. `\n` and `\t`) replaced with their unescaped UTF-8 + # character counterparts. + # + # @param [String] input + # @return [String] the value returned by `::EBNF::Unescape.unescape_string` + # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes + def self.unescape_string(input) + ::EBNF::Unescape.unescape_string(input) + end + + ## # Tokenizes the given `input` string or stream. 
# diff --git a/spec/ll1/lexer_spec.rb b/spec/ll1/lexer_spec.rb index 845feb0..76354f9 100644 --- a/spec/ll1/lexer_spec.rb +++ b/spec/ll1/lexer_spec.rb @@ -25,6 +25,75 @@ :STRING_LITERAL_QUOTE, :STRING_LITERAL_SINGLE_QUOTE, :STRING_LITERAL_LONG_SINGLE_QUOTE, :STRING_LITERAL_LONG_QUOTE ]} + + describe ".unescape_codepoints" do + # @see https://www.w3.org/TR/rdf-sparql-query/#codepointEscape + + it "unescapes \\uXXXX codepoint escape sequences" do + inputs = { + %q(\\u0020) => %q( ), + %q(<ab\\u00E9xy>) => %Q(<ab\xC3\xA9xy>), + %q(\\u03B1:a) => %Q(\xCE\xB1:a), + %q(a\\u003Ab) => %Q(a\x3Ab), + } + inputs.each do |input, output| + output.force_encoding(Encoding::UTF_8) + expect(EBNF::LL1::Lexer.unescape_codepoints(input)).to eq output + end + end + + it "unescapes \\UXXXXXXXX codepoint escape sequences" do + inputs = { + %q(\\U00000020) => %q( ), + %q(\\U00010000) => %Q(\xF0\x90\x80\x80), + %q(\\U000EFFFF) => %Q(\xF3\xAF\xBF\xBF), + } + inputs.each do |input, output| + output.force_encoding(Encoding::UTF_8) + expect(EBNF::LL1::Lexer.unescape_codepoints(input)).to eq output + end + end + + context "escaped strings" do + { + 'Dürst' => 'D\\u00FCrst', + "é" => '\\u00E9', + "€" => '\\u20AC', + "resumé" => 'resum\\u00E9', + }.each_pair do |unescaped, escaped| + it "unescapes #{unescaped.inspect}" do + expect(EBNF::LL1::Lexer.unescape_codepoints(escaped)).to eq unescaped + end + end + end + end + + describe ".unescape_string" do + # @see https://www.w3.org/TR/rdf-sparql-query/#grammarEscapes + + context "escape sequences" do + EBNF::LL1::Lexer::ESCAPE_CHARS.each do |escaped, unescaped| + it "unescapes #{unescaped.inspect}" do + expect(EBNF::LL1::Lexer.unescape_string(escaped)).to eq unescaped + end + end + end + + context "escaped strings" do + { + 'simple literal' => 'simple literal', + 'backslash:\\' => 'backslash:\\\\', + 'dquote:"' => 'dquote:\\"', + "newline:\n" => 'newline:\\n', + "return\r" => 'return\\r', + "tab:\t" => 'tab:\\t', + }.each_pair do |unescaped, escaped| + it "unescapes 
#{unescaped.inspect}" do + expect(EBNF::LL1::Lexer.unescape_string(escaped)).to eq unescaped + end + end + end + end describe ".tokenize" do context "numeric literals" do