From 45d95eab76a9b78fcf24e4d65a2764dd4908d897 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sun, 16 Jun 2024 08:41:03 -0500 Subject: [PATCH] Add support for regexp passed to String#index --- include/natalie/string_object.hpp | 10 +- spec/core/string/index_spec.rb | 355 ++++++++++++++++++++++++++++++ spec/core/string/match_spec.rb | 10 +- spec/shared/string/end_with.rb | 8 +- src/regexp_object.cpp | 2 + src/string_object.cpp | 60 +++-- 6 files changed, 407 insertions(+), 38 deletions(-) create mode 100644 spec/core/string/index_spec.rb diff --git a/include/natalie/string_object.hpp b/include/natalie/string_object.hpp index 95c7955cf..1d15a6121 100644 --- a/include/natalie/string_object.hpp +++ b/include/natalie/string_object.hpp @@ -273,9 +273,9 @@ class StringObject : public Object { StringObject *successive(Env *); StringObject *successive_in_place(Env *); - Value index(Env *, Value, Value) const; - Value index(Env *, Value, size_t start) const; - nat_int_t index_int(Env *, Value, size_t byte_start) const; + Value index(Env *, Value, Value); + Value index(Env *, Value, size_t start); + nat_int_t index_int(Env *, Value, size_t byte_start); Value rindex(Env *, Value) const; @@ -396,8 +396,8 @@ class StringObject : public Object { static size_t byte_index_to_char_index(ArrayObject *chars, size_t byte_index); - size_t char_index_to_byte_index(size_t) const; - size_t byte_index_to_char_index(size_t) const; + ssize_t char_index_to_byte_index(size_t) const; + ssize_t byte_index_to_char_index(size_t) const; static CaseMapType check_case_options(Env *env, Value arg1, Value arg2, bool downcase = false); diff --git a/spec/core/string/index_spec.rb b/spec/core/string/index_spec.rb new file mode 100644 index 000000000..bda38c78c --- /dev/null +++ b/spec/core/string/index_spec.rb @@ -0,0 +1,355 @@ +# -*- encoding: utf-8 -*- +require_relative '../../spec_helper' +require_relative 'fixtures/classes' + +describe "String#index" do + it "raises a TypeError if passed nil" do + -> { 
"abc".index nil }.should raise_error(TypeError) + end + + it "raises a TypeError if passed a boolean" do + -> { "abc".index true }.should raise_error(TypeError) + end + + it "raises a TypeError if passed a Symbol" do + -> { "abc".index :a }.should raise_error(TypeError) + end + + it "calls #to_str to convert the first argument" do + char = mock("string index char") + char.should_receive(:to_str).and_return("b") + "abc".index(char).should == 1 + end + + it "calls #to_int to convert the second argument" do + offset = mock("string index offset") + offset.should_receive(:to_int).and_return(1) + "abc".index("c", offset).should == 2 + end + + it "raises a TypeError if passed an Integer" do + -> { "abc".index 97 }.should raise_error(TypeError) + end +end + +describe "String#index with String" do + it "behaves the same as String#index(char) for one-character strings" do + "blablabla hello cruel world...!".split("").uniq.each do |str| + chr = str[0] + str.index(str).should == str.index(chr) + + 0.upto(str.size + 1) do |start| + str.index(str, start).should == str.index(chr, start) + end + + (-str.size - 1).upto(-1) do |start| + str.index(str, start).should == str.index(chr, start) + end + end + end + + it "returns the index of the first occurrence of the given substring" do + "blablabla".index("").should == 0 + "blablabla".index("b").should == 0 + "blablabla".index("bla").should == 0 + "blablabla".index("blabla").should == 0 + "blablabla".index("blablabla").should == 0 + + "blablabla".index("l").should == 1 + "blablabla".index("la").should == 1 + "blablabla".index("labla").should == 1 + "blablabla".index("lablabla").should == 1 + + "blablabla".index("a").should == 2 + "blablabla".index("abla").should == 2 + "blablabla".index("ablabla").should == 2 + end + + it "doesn't set $~" do + $~ = nil + + 'hello.'.index('ll') + $~.should == nil + end + + it "ignores string subclasses" do + "blablabla".index(StringSpecs::MyString.new("bla")).should == 0 + 
StringSpecs::MyString.new("blablabla").index("bla").should == 0 + StringSpecs::MyString.new("blablabla").index(StringSpecs::MyString.new("bla")).should == 0 + end + + it "starts the search at the given offset" do + "blablabla".index("bl", 0).should == 0 + "blablabla".index("bl", 1).should == 3 + "blablabla".index("bl", 2).should == 3 + "blablabla".index("bl", 3).should == 3 + + "blablabla".index("bla", 0).should == 0 + "blablabla".index("bla", 1).should == 3 + "blablabla".index("bla", 2).should == 3 + "blablabla".index("bla", 3).should == 3 + + "blablabla".index("blab", 0).should == 0 + "blablabla".index("blab", 1).should == 3 + "blablabla".index("blab", 2).should == 3 + "blablabla".index("blab", 3).should == 3 + + "blablabla".index("la", 1).should == 1 + "blablabla".index("la", 2).should == 4 + "blablabla".index("la", 3).should == 4 + "blablabla".index("la", 4).should == 4 + + "blablabla".index("lab", 1).should == 1 + "blablabla".index("lab", 2).should == 4 + "blablabla".index("lab", 3).should == 4 + "blablabla".index("lab", 4).should == 4 + + "blablabla".index("ab", 2).should == 2 + "blablabla".index("ab", 3).should == 5 + "blablabla".index("ab", 4).should == 5 + "blablabla".index("ab", 5).should == 5 + + "blablabla".index("", 0).should == 0 + + "blablabla".index("", 1).should == 1 + "blablabla".index("", 2).should == 2 + "blablabla".index("", 7).should == 7 + "blablabla".index("", 8).should == 8 + "blablabla".index("", 9).should == 9 + end + + it "starts the search at offset + self.length if offset is negative" do + str = "blablabla" + + ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle| + (-str.length .. 
-1).each do |offset| + str.index(needle, offset).should == + str.index(needle, offset + str.length) + end + end + end + + it "returns nil if the substring isn't found" do + "blablabla".index("B").should == nil + "blablabla".index("z").should == nil + "blablabla".index("BLA").should == nil + "blablabla".index("blablablabla").should == nil + "blablabla".index("", 10).should == nil + + "hello".index("he", 1).should == nil + "hello".index("he", 2).should == nil + "I’ve got a multibyte character.\n".index("\n\n").should == nil + end + + it "returns the character index of a multibyte character" do + "ありがとう".index("が").should == 2 + end + + it "returns the character index after offset" do + "われわれ".index("わ", 1).should == 2 + "ありがとうありがとう".index("が", 3).should == 7 + end + + it "returns the character index after a partial first match" do + "</</h".index("</h").should == 2 + end + + it "raises an Encoding::CompatibilityError if the encodings are incompatible" do + char = "れ".encode Encoding::EUC_JP + -> do + "あれ".index char + end.should raise_error(Encoding::CompatibilityError) + end + + it "handles a substring in a superset encoding" do + 'abc'.dup.force_encoding(Encoding::US_ASCII).index('é').should == nil + end + + it "handles a substring in a subset encoding" do + 'été'.index('t'.dup.force_encoding(Encoding::US_ASCII)).should == 1 + end + + it "raises an Encoding::CompatibilityError if the encodings are incompatible" do + NATFIXME 'Implement ISO-2022-JP', exception: ArgumentError, message: 'unknown encoding name - "ISO-2022-JP"' do + str = 'abc'.dup.force_encoding("ISO-2022-JP") + pattern = 'b'.dup.force_encoding("EUC-JP") + + -> { str.index(pattern) }.should raise_error(Encoding::CompatibilityError, "incompatible character encodings: ISO-2022-JP and EUC-JP") + end + end +end + +describe "String#index with Regexp" do + it "behaves the same as String#index(string) for escaped string regexps" do + ["blablabla", "hello cruel world...!"].each do |str| + ["", "b", "bla", "lab", "o c", "d."].each do |needle| + regexp = Regexp.new(Regexp.escape(needle)) + str.index(regexp).should == str.index(needle) + + 0.upto(str.size + 1) do |start| +
str.index(regexp, start).should == str.index(needle, start) + end + + (-str.size - 1).upto(-1) do |start| + str.index(regexp, start).should == str.index(needle, start) + end + end + end + end + + it "returns the index of the first match of regexp" do + "blablabla".index(/bla/).should == 0 + "blablabla".index(/BLA/i).should == 0 + + "blablabla".index(/.{0}/).should == 0 + "blablabla".index(/.{6}/).should == 0 + "blablabla".index(/.{9}/).should == 0 + + "blablabla".index(/.*/).should == 0 + "blablabla".index(/.+/).should == 0 + + "blablabla".index(/lab|b/).should == 0 + + not_supported_on :opal do + "blablabla".index(/\A/).should == 0 + "blablabla".index(/\Z/).should == 9 + "blablabla".index(/\z/).should == 9 + "blablabla\n".index(/\Z/).should == 9 + "blablabla\n".index(/\z/).should == 10 + end + + "blablabla".index(/^/).should == 0 + "\nblablabla".index(/^/).should == 0 + "b\nablabla".index(/$/).should == 1 + "bl\nablabla".index(/$/).should == 2 + + "blablabla".index(/.l./).should == 0 + end + + it "sets $~ to MatchData of match and nil when there's none" do + 'hello.'.index(/.(.)/) + $~[0].should == 'he' + + 'hello.'.index(/not/) + $~.should == nil + end + + ruby_bug "#20421", ""..."3.3" do + it "always clear $~" do + "a".index(/a/) + $~.should_not == nil + + string = "blablabla" + string.index(/bla/, string.length + 1) + $~.should == nil + end + end + + it "starts the search at the given offset" do + "blablabla".index(/.{0}/, 5).should == 5 + "blablabla".index(/.{1}/, 5).should == 5 + "blablabla".index(/.{2}/, 5).should == 5 + "blablabla".index(/.{3}/, 5).should == 5 + "blablabla".index(/.{4}/, 5).should == 5 + + "blablabla".index(/.{0}/, 3).should == 3 + "blablabla".index(/.{1}/, 3).should == 3 + "blablabla".index(/.{2}/, 3).should == 3 + "blablabla".index(/.{5}/, 3).should == 3 + "blablabla".index(/.{6}/, 3).should == 3 + + "blablabla".index(/.l./, 0).should == 0 + "blablabla".index(/.l./, 1).should == 3 + "blablabla".index(/.l./, 2).should == 3 + 
"blablabla".index(/.l./, 3).should == 3 + + "xblaxbla".index(/x./, 0).should == 0 + "xblaxbla".index(/x./, 1).should == 4 + "xblaxbla".index(/x./, 2).should == 4 + + not_supported_on :opal do + "blablabla\n".index(/\Z/, 9).should == 9 + end + end + + it "starts the search at offset + self.length if offset is negative" do + str = "blablabla" + + ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle| + (-str.length .. -1).each do |offset| + str.index(needle, offset).should == + str.index(needle, offset + str.length) + end + end + end + + it "returns nil if the substring isn't found" do + "blablabla".index(/BLA/).should == nil + + "blablabla".index(/.{10}/).should == nil + "blaxbla".index(/.x/, 3).should == nil + "blaxbla".index(/..x/, 2).should == nil + end + + it "returns nil if the Regexp matches the empty string and the offset is out of range" do + "ruby".index(//,12).should be_nil + end + + it "supports \\G which matches at the given start offset" do + "helloYOU.".index(/\GYOU/, 5).should == 5 + "helloYOU.".index(/\GYOU/).should == nil + + re = /\G.+YOU/ + # The # marks where \G will match. 
+ [ + ["#hi!YOUall.", 0], + ["h#i!YOUall.", 1], + ["hi#!YOUall.", 2], + ["hi!#YOUall.", nil] + ].each do |spec| + + start = spec[0].index("#") + str = spec[0].delete("#") + + str.index(re, start).should == spec[1] + end + end + + it "converts start_offset to an integer via to_int" do + obj = mock('1') + obj.should_receive(:to_int).and_return(1) + "RWOARW".index(/R./, obj).should == 4 + end + + it "returns the character index of a multibyte character" do + "ありがとう".index(/が/).should == 2 + end + + it "returns the character index after offset" do + "われわれ".index(/わ/, 1).should == 2 + end + + it "treats the offset as a character index" do + "われわわれ".index(/わ/, 3).should == 3 + end + + ruby_bug "#19763", ""..."3.3.0" do + it "raises an Encoding::CompatibilityError if the encodings are incompatible" do + re = Regexp.new "れ".encode(Encoding::EUC_JP) + -> do + "あれ".index re + end.should raise_error(Encoding::CompatibilityError, "incompatible encoding regexp match (EUC-JP regexp with UTF-8 string)") + end + end + + # The exception message was incorrectly "incompatible character encodings: UTF-8 and EUC-JP" before 3.3.0 + # Still test that the right exception class is used before that. 
+ it "raises an Encoding::CompatibilityError if the encodings are incompatible" do + re = Regexp.new "れ".encode(Encoding::EUC_JP) + NATFIXME 'Implement real encodings on Regexp', exception: SpecFailedException, message: /Encoding::CompatibilityError/ do + -> do + "あれ".index re + end.should raise_error(Encoding::CompatibilityError) + end + end +end diff --git a/spec/core/string/match_spec.rb b/spec/core/string/match_spec.rb index d0ff6a1d6..339ca0e60 100644 --- a/spec/core/string/match_spec.rb +++ b/spec/core/string/match_spec.rb @@ -12,12 +12,10 @@ describe "String#=~" do it "behaves the same way as index() when given a regexp" do - NATFIXME 'Implement Regexp argument in String#index', exception: TypeError, message: 'no implicit conversion of Regexp into String' do - ("rudder" =~ /udder/).should == "rudder".index(/udder/) - ("boat" =~ /[^fl]oat/).should == "boat".index(/[^fl]oat/) - ("bean" =~ /bag/).should == "bean".index(/bag/) - ("true" =~ /false/).should == "true".index(/false/) - end + ("rudder" =~ /udder/).should == "rudder".index(/udder/) + ("boat" =~ /[^fl]oat/).should == "boat".index(/[^fl]oat/) + ("bean" =~ /bag/).should == "bean".index(/bag/) + ("true" =~ /false/).should == "true".index(/false/) end it "raises a TypeError if a obj is a string" do diff --git a/spec/shared/string/end_with.rb b/spec/shared/string/end_with.rb index 2233a864b..c88980930 100644 --- a/spec/shared/string/end_with.rb +++ b/spec/shared/string/end_with.rb @@ -48,11 +48,9 @@ it "raises an Encoding::CompatibilityError if the encodings are incompatible" do pat = "ア".encode Encoding::EUC_JP - NATFIXME 'Raise Encoding::CompatibilityError', exception: SpecFailedException do - -> do - "あれ".send(@method).end_with?(pat) - end.should raise_error(Encoding::CompatibilityError) - end + -> do + "あれ".send(@method).end_with?(pat) + end.should raise_error(Encoding::CompatibilityError) end it "checks that we are starting to match at the head of a character" do diff --git a/src/regexp_object.cpp 
b/src/regexp_object.cpp index 9b8ed294c..1e7208d4e 100644 --- a/src/regexp_object.cpp +++ b/src/regexp_object.cpp @@ -403,6 +403,7 @@ Value RegexpObject::match(Env *env, Value other, Value start, Block *block) { start_byte_index = start->as_integer_or_raise(env)->to_nat_int_t(); if (start_byte_index < 0) { + // FIXME: move this logic to StringObject::char_index_to_byte_index size_t byte_index = str_obj->bytesize(); ssize_t char_index = 0; TM::StringView view; @@ -412,6 +413,7 @@ Value RegexpObject::match(Env *env, Value other, Value start, Block *block) { } while (byte_index != 0 && start_byte_index < char_index); start_byte_index = byte_index; } else { + // FIXME: use StringObject::char_index_to_byte_index size_t byte_index = 0; ssize_t char_index = 0; TM::StringView view; diff --git a/src/string_object.cpp b/src/string_object.cpp index b4f80e65a..37bfc3697 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -635,7 +635,7 @@ bool StringObject::end_with(Env *env, Args args) const { return false; } -Value StringObject::index(Env *env, Value needle, Value offset) const { +Value StringObject::index(Env *env, Value needle, Value offset) { int offset_i = (offset) ? 
IntegerObject::convert_to_int(env, offset) : 0; int len = char_count(env); if (offset_i < -1 * len) { @@ -648,29 +648,45 @@ Value StringObject::index(Env *env, Value needle, Value offset) const { return index(env, needle, ::abs(offset_i)); } -// TODO: Handle Regexp needle case -Value StringObject::index(Env *env, Value needle, size_t start) const { - auto byte_index = index_int(env, needle, start); - if (byte_index == -1) { +Value StringObject::index(Env *env, Value needle, size_t start) { + auto byte_start = char_index_to_byte_index(start); + if (byte_start == -1) return NilObject::the(); - } - size_t byte_index_size_t = static_cast<size_t>(byte_index); - size_t char_index = 0, index = 0; - auto c = next_char(&index); - while (!c.is_empty()) { - if (index > byte_index_size_t) - return IntegerObject::from_size_t(env, char_index); - char_index++; - c = next_char(&index); - } - return Value::integer(0); + auto byte_index = index_int(env, needle, byte_start); + if (byte_index == -1) + return NilObject::the(); + auto char_index = byte_index_to_char_index(byte_index); + return IntegerObject::from_size_t(env, char_index); } -nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) const { +nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) { + if (needle->is_regexp()) { + if (needle->as_regexp()->pattern().is_empty()) + return byte_start; + + if (bytesize() == 0) + return -1; + + if (byte_start >= bytesize()) + return -1; + + OnigRegion *region = onig_region_new(); + int result = needle->as_regexp()->search(string(), byte_start, region, ONIG_OPTION_NONE); + if (result == ONIG_MISMATCH) { + env->caller()->set_last_match(nullptr); + return -1; + } + + auto match = new MatchDataObject { region, this, needle->as_regexp() }; + env->caller()->set_last_match(match); + return region->beg[0]; + } + auto needle_str = needle->to_str(env)->as_string(); + assert_compatible_string(env, needle_str); if (needle_str->bytesize() == 0) - return 0; +
return byte_start; if (bytesize() == 0) return -1; @@ -1847,7 +1863,7 @@ size_t StringObject::byte_index_to_char_index(ArrayObject *chars, size_t byte_in return char_index; } -size_t StringObject::char_index_to_byte_index(size_t char_index) const { +ssize_t StringObject::char_index_to_byte_index(size_t char_index) const { if (m_encoding->is_single_byte_encoding()) return char_index; @@ -1857,12 +1873,12 @@ size_t StringObject::char_index_to_byte_index(size_t char_index) const { while (char_index > current_char_index) { view = next_char(&current_byte_index); current_char_index++; - if (view.is_empty()) break; + if (view.is_empty()) return -1; } return current_byte_index; } -size_t StringObject::byte_index_to_char_index(size_t byte_index) const { +ssize_t StringObject::byte_index_to_char_index(size_t byte_index) const { if (m_encoding->is_single_byte_encoding()) return byte_index; @@ -1872,7 +1888,7 @@ size_t StringObject::byte_index_to_char_index(size_t byte_index) const { while (byte_index > current_byte_index) { view = next_char(&current_byte_index); current_char_index++; - if (view.is_empty()) break; + if (view.is_empty()) return -1; } return current_char_index; }