From 45d95eab76a9b78fcf24e4d65a2764dd4908d897 Mon Sep 17 00:00:00 2001
From: Tim Morgan <tim@timmorgan.org>
Date: Sun, 16 Jun 2024 08:41:03 -0500
Subject: [PATCH] Add support for regexp passed to String#index

---
 include/natalie/string_object.hpp |  10 +-
 spec/core/string/index_spec.rb    | 355 ++++++++++++++++++++++++++++++
 spec/core/string/match_spec.rb    |  10 +-
 spec/shared/string/end_with.rb    |   8 +-
 src/regexp_object.cpp             |   2 +
 src/string_object.cpp             |  60 +++--
 6 files changed, 407 insertions(+), 38 deletions(-)
 create mode 100644 spec/core/string/index_spec.rb

diff --git a/include/natalie/string_object.hpp b/include/natalie/string_object.hpp
index 95c7955cf..1d15a6121 100644
--- a/include/natalie/string_object.hpp
+++ b/include/natalie/string_object.hpp
@@ -273,9 +273,9 @@ class StringObject : public Object {
     StringObject *successive(Env *);
     StringObject *successive_in_place(Env *);
 
-    Value index(Env *, Value, Value) const;
-    Value index(Env *, Value, size_t start) const;
-    nat_int_t index_int(Env *, Value, size_t byte_start) const;
+    Value index(Env *, Value, Value);
+    Value index(Env *, Value, size_t start);
+    nat_int_t index_int(Env *, Value, size_t byte_start);
 
     Value rindex(Env *, Value) const;
 
@@ -396,8 +396,8 @@ class StringObject : public Object {
 
     static size_t byte_index_to_char_index(ArrayObject *chars, size_t byte_index);
 
-    size_t char_index_to_byte_index(size_t) const;
-    size_t byte_index_to_char_index(size_t) const;
+    ssize_t char_index_to_byte_index(size_t) const;
+    ssize_t byte_index_to_char_index(size_t) const;
 
     static CaseMapType check_case_options(Env *env, Value arg1, Value arg2, bool downcase = false);
 
diff --git a/spec/core/string/index_spec.rb b/spec/core/string/index_spec.rb
new file mode 100644
index 000000000..bda38c78c
--- /dev/null
+++ b/spec/core/string/index_spec.rb
@@ -0,0 +1,355 @@
+# -*- encoding: utf-8 -*-
+require_relative '../../spec_helper'
+require_relative 'fixtures/classes'
+
+describe "String#index" do
+  it "raises a TypeError if passed nil" do
+    -> { "abc".index nil }.should raise_error(TypeError)
+  end
+
+  it "raises a TypeError if passed a boolean" do
+    -> { "abc".index true }.should raise_error(TypeError)
+  end
+
+  it "raises a TypeError if passed a Symbol" do
+    -> { "abc".index :a }.should raise_error(TypeError)
+  end
+
+  it "calls #to_str to convert the first argument" do
+    char = mock("string index char")
+    char.should_receive(:to_str).and_return("b")
+    "abc".index(char).should == 1
+  end
+
+  it "calls #to_int to convert the second argument" do
+    offset = mock("string index offset")
+    offset.should_receive(:to_int).and_return(1)
+    "abc".index("c", offset).should == 2
+  end
+
+  it "raises a TypeError if passed an Integer" do
+    -> { "abc".index 97 }.should raise_error(TypeError)
+  end
+end
+
+describe "String#index with String" do
+  it "behaves the same as String#index(char) for one-character strings" do
+    "blablabla hello cruel world...!".split("").uniq.each do |str|
+      chr = str[0]
+      str.index(str).should == str.index(chr)
+
+      0.upto(str.size + 1) do |start|
+        str.index(str, start).should == str.index(chr, start)
+      end
+
+      (-str.size - 1).upto(-1) do |start|
+        str.index(str, start).should == str.index(chr, start)
+      end
+    end
+  end
+
+  it "returns the index of the first occurrence of the given substring" do
+    "blablabla".index("").should == 0
+    "blablabla".index("b").should == 0
+    "blablabla".index("bla").should == 0
+    "blablabla".index("blabla").should == 0
+    "blablabla".index("blablabla").should == 0
+
+    "blablabla".index("l").should == 1
+    "blablabla".index("la").should == 1
+    "blablabla".index("labla").should == 1
+    "blablabla".index("lablabla").should == 1
+
+    "blablabla".index("a").should == 2
+    "blablabla".index("abla").should == 2
+    "blablabla".index("ablabla").should == 2
+  end
+
+  it "doesn't set $~" do
+    $~ = nil
+
+    'hello.'.index('ll')
+    $~.should == nil
+  end
+
+  it "ignores string subclasses" do
+    "blablabla".index(StringSpecs::MyString.new("bla")).should == 0
+    StringSpecs::MyString.new("blablabla").index("bla").should == 0
+    StringSpecs::MyString.new("blablabla").index(StringSpecs::MyString.new("bla")).should == 0
+  end
+
+  it "starts the search at the given offset" do
+    "blablabla".index("bl", 0).should == 0
+    "blablabla".index("bl", 1).should == 3
+    "blablabla".index("bl", 2).should == 3
+    "blablabla".index("bl", 3).should == 3
+
+    "blablabla".index("bla", 0).should == 0
+    "blablabla".index("bla", 1).should == 3
+    "blablabla".index("bla", 2).should == 3
+    "blablabla".index("bla", 3).should == 3
+
+    "blablabla".index("blab", 0).should == 0
+    "blablabla".index("blab", 1).should == 3
+    "blablabla".index("blab", 2).should == 3
+    "blablabla".index("blab", 3).should == 3
+
+    "blablabla".index("la", 1).should == 1
+    "blablabla".index("la", 2).should == 4
+    "blablabla".index("la", 3).should == 4
+    "blablabla".index("la", 4).should == 4
+
+    "blablabla".index("lab", 1).should == 1
+    "blablabla".index("lab", 2).should == 4
+    "blablabla".index("lab", 3).should == 4
+    "blablabla".index("lab", 4).should == 4
+
+    "blablabla".index("ab", 2).should == 2
+    "blablabla".index("ab", 3).should == 5
+    "blablabla".index("ab", 4).should == 5
+    "blablabla".index("ab", 5).should == 5
+
+    "blablabla".index("", 0).should == 0
+
+    "blablabla".index("", 1).should == 1
+    "blablabla".index("", 2).should == 2
+    "blablabla".index("", 7).should == 7
+    "blablabla".index("", 8).should == 8
+    "blablabla".index("", 9).should == 9
+  end
+
+  it "starts the search at offset + self.length if offset is negative" do
+    str = "blablabla"
+
+    ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle|
+      (-str.length .. -1).each do |offset|
+        str.index(needle, offset).should ==
+        str.index(needle, offset + str.length)
+      end
+    end
+  end
+
+  it "returns nil if the substring isn't found" do
+    "blablabla".index("B").should == nil
+    "blablabla".index("z").should == nil
+    "blablabla".index("BLA").should == nil
+    "blablabla".index("blablablabla").should == nil
+    "blablabla".index("", 10).should == nil
+
+    "hello".index("he", 1).should == nil
+    "hello".index("he", 2).should == nil
+    "I’ve got a multibyte character.\n".index("\n\n").should == nil
+  end
+
+  it "returns the character index of a multibyte character" do
+    "ありがとう".index("が").should == 2
+  end
+
+  it "returns the character index after offset" do
+    "われわれ".index("わ", 1).should == 2
+    "ありがとうありがとう".index("が", 3).should == 7
+  end
+
+  it "returns the character index after a partial first match" do
+    "</</h".index("</h").should == 2
+  end
+
+  it "raises an Encoding::CompatibilityError if the encodings are incompatible" do
+    char = "れ".encode Encoding::EUC_JP
+    -> do
+      "あれ".index char
+    end.should raise_error(Encoding::CompatibilityError)
+  end
+
+  it "handles a substring in a superset encoding" do
+    'abc'.dup.force_encoding(Encoding::US_ASCII).index('é').should == nil
+  end
+
+  it "handles a substring in a subset encoding" do
+    'été'.index('t'.dup.force_encoding(Encoding::US_ASCII)).should == 1
+  end
+
+  it "raises an Encoding::CompatibilityError if the encodings are incompatible" do
+    NATFIXME 'Implement ISO-2022-JP', exception: ArgumentError, message: 'unknown encoding name - "ISO-2022-JP"' do
+      str = 'abc'.dup.force_encoding("ISO-2022-JP")
+      pattern = 'b'.dup.force_encoding("EUC-JP")
+
+      -> { str.index(pattern) }.should raise_error(Encoding::CompatibilityError, "incompatible character encodings: ISO-2022-JP and EUC-JP")
+    end
+  end
+end
+
+describe "String#index with Regexp" do
+  it "behaves the same as String#index(string) for escaped string regexps" do
+    ["blablabla", "hello cruel world...!"].each do |str|
+      ["", "b", "bla", "lab", "o c", "d."].each do |needle|
+        regexp = Regexp.new(Regexp.escape(needle))
+        str.index(regexp).should == str.index(needle)
+
+        0.upto(str.size + 1) do |start|
+          str.index(regexp, start).should == str.index(needle, start)
+        end
+
+        (-str.size - 1).upto(-1) do |start|
+          str.index(regexp, start).should == str.index(needle, start)
+        end
+      end
+    end
+  end
+
+  it "returns the index of the first match of regexp" do
+    "blablabla".index(/bla/).should == 0
+    "blablabla".index(/BLA/i).should == 0
+
+    "blablabla".index(/.{0}/).should == 0
+    "blablabla".index(/.{6}/).should == 0
+    "blablabla".index(/.{9}/).should == 0
+
+    "blablabla".index(/.*/).should == 0
+    "blablabla".index(/.+/).should == 0
+
+    "blablabla".index(/lab|b/).should == 0
+
+    not_supported_on :opal do
+      "blablabla".index(/\A/).should == 0
+      "blablabla".index(/\Z/).should == 9
+      "blablabla".index(/\z/).should == 9
+      "blablabla\n".index(/\Z/).should == 9
+      "blablabla\n".index(/\z/).should == 10
+    end
+
+    "blablabla".index(/^/).should == 0
+    "\nblablabla".index(/^/).should == 0
+    "b\nablabla".index(/$/).should == 1
+    "bl\nablabla".index(/$/).should == 2
+
+    "blablabla".index(/.l./).should == 0
+  end
+
+  it "sets $~ to MatchData of match and nil when there's none" do
+    'hello.'.index(/.(.)/)
+    $~[0].should == 'he'
+
+    'hello.'.index(/not/)
+    $~.should == nil
+  end
+
+  ruby_bug "#20421", ""..."3.3" do
+    it "always clear $~" do
+      "a".index(/a/)
+      $~.should_not == nil
+
+      string = "blablabla"
+      string.index(/bla/, string.length + 1)
+      $~.should == nil
+    end
+  end
+
+  it "starts the search at the given offset" do
+    "blablabla".index(/.{0}/, 5).should == 5
+    "blablabla".index(/.{1}/, 5).should == 5
+    "blablabla".index(/.{2}/, 5).should == 5
+    "blablabla".index(/.{3}/, 5).should == 5
+    "blablabla".index(/.{4}/, 5).should == 5
+
+    "blablabla".index(/.{0}/, 3).should == 3
+    "blablabla".index(/.{1}/, 3).should == 3
+    "blablabla".index(/.{2}/, 3).should == 3
+    "blablabla".index(/.{5}/, 3).should == 3
+    "blablabla".index(/.{6}/, 3).should == 3
+
+    "blablabla".index(/.l./, 0).should == 0
+    "blablabla".index(/.l./, 1).should == 3
+    "blablabla".index(/.l./, 2).should == 3
+    "blablabla".index(/.l./, 3).should == 3
+
+    "xblaxbla".index(/x./, 0).should == 0
+    "xblaxbla".index(/x./, 1).should == 4
+    "xblaxbla".index(/x./, 2).should == 4
+
+    not_supported_on :opal do
+      "blablabla\n".index(/\Z/, 9).should == 9
+    end
+  end
+
+  it "starts the search at offset + self.length if offset is negative" do
+    str = "blablabla"
+
+    ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle|
+      (-str.length .. -1).each do |offset|
+        str.index(needle, offset).should ==
+        str.index(needle, offset + str.length)
+      end
+    end
+  end
+
+  it "returns nil if the substring isn't found" do
+    "blablabla".index(/BLA/).should == nil
+
+    "blablabla".index(/.{10}/).should == nil
+    "blaxbla".index(/.x/, 3).should == nil
+    "blaxbla".index(/..x/, 2).should == nil
+  end
+
+  it "returns nil if the Regexp matches the empty string and the offset is out of range" do
+    "ruby".index(//,12).should be_nil
+  end
+
+  it "supports \\G which matches at the given start offset" do
+    "helloYOU.".index(/\GYOU/, 5).should == 5
+    "helloYOU.".index(/\GYOU/).should == nil
+
+    re = /\G.+YOU/
+    # The # marks where \G will match.
+    [
+      ["#hi!YOUall.", 0],
+      ["h#i!YOUall.", 1],
+      ["hi#!YOUall.", 2],
+      ["hi!#YOUall.", nil]
+    ].each do |spec|
+
+      start = spec[0].index("#")
+      str = spec[0].delete("#")
+
+      str.index(re, start).should == spec[1]
+    end
+  end
+
+  it "converts start_offset to an integer via to_int" do
+    obj = mock('1')
+    obj.should_receive(:to_int).and_return(1)
+    "RWOARW".index(/R./, obj).should == 4
+  end
+
+  it "returns the character index of a multibyte character" do
+    "ありがとう".index(/が/).should == 2
+  end
+
+  it "returns the character index after offset" do
+    "われわれ".index(/わ/, 1).should == 2
+  end
+
+  it "treats the offset as a character index" do
+    "われわわれ".index(/わ/, 3).should == 3
+  end
+
+  ruby_bug "#19763", ""..."3.3.0" do
+    it "raises an Encoding::CompatibilityError if the encodings are incompatible" do
+      re = Regexp.new "れ".encode(Encoding::EUC_JP)
+      -> do
+        "あれ".index re
+      end.should raise_error(Encoding::CompatibilityError, "incompatible encoding regexp match (EUC-JP regexp with UTF-8 string)")
+    end
+  end
+
+  # The exception message was incorrectly "incompatible character encodings: UTF-8 and EUC-JP" before 3.3.0
+  # Still test that the right exception class is used before that.
+  it "raises an Encoding::CompatibilityError if the encodings are incompatible" do
+    re = Regexp.new "れ".encode(Encoding::EUC_JP)
+    NATFIXME 'Implement real encodings on Regexp', exception: SpecFailedException, message: /Encoding::CompatibilityError/ do
+      -> do
+        "あれ".index re
+      end.should raise_error(Encoding::CompatibilityError)
+    end
+  end
+end
diff --git a/spec/core/string/match_spec.rb b/spec/core/string/match_spec.rb
index d0ff6a1d6..339ca0e60 100644
--- a/spec/core/string/match_spec.rb
+++ b/spec/core/string/match_spec.rb
@@ -12,12 +12,10 @@
 
 describe "String#=~" do
   it "behaves the same way as index() when given a regexp" do
-    NATFIXME 'Implement Regexp argument in String#index', exception: TypeError, message: 'no implicit conversion of Regexp into String' do
-      ("rudder" =~ /udder/).should == "rudder".index(/udder/)
-      ("boat" =~ /[^fl]oat/).should == "boat".index(/[^fl]oat/)
-      ("bean" =~ /bag/).should == "bean".index(/bag/)
-      ("true" =~ /false/).should == "true".index(/false/)
-    end
+    ("rudder" =~ /udder/).should == "rudder".index(/udder/)
+    ("boat" =~ /[^fl]oat/).should == "boat".index(/[^fl]oat/)
+    ("bean" =~ /bag/).should == "bean".index(/bag/)
+    ("true" =~ /false/).should == "true".index(/false/)
   end
 
   it "raises a TypeError if a obj is a string" do
diff --git a/spec/shared/string/end_with.rb b/spec/shared/string/end_with.rb
index 2233a864b..c88980930 100644
--- a/spec/shared/string/end_with.rb
+++ b/spec/shared/string/end_with.rb
@@ -48,11 +48,9 @@
 
   it "raises an Encoding::CompatibilityError if the encodings are incompatible" do
     pat = "ア".encode Encoding::EUC_JP
-    NATFIXME 'Raise Encoding::CompatibilityError', exception: SpecFailedException do
-      -> do
-        "あれ".send(@method).end_with?(pat)
-      end.should raise_error(Encoding::CompatibilityError)
-    end
+    -> do
+      "あれ".send(@method).end_with?(pat)
+    end.should raise_error(Encoding::CompatibilityError)
   end
 
   it "checks that we are starting to match at the head of a character" do
diff --git a/src/regexp_object.cpp b/src/regexp_object.cpp
index 9b8ed294c..1e7208d4e 100644
--- a/src/regexp_object.cpp
+++ b/src/regexp_object.cpp
@@ -403,6 +403,7 @@ Value RegexpObject::match(Env *env, Value other, Value start, Block *block) {
         start_byte_index = start->as_integer_or_raise(env)->to_nat_int_t();
 
         if (start_byte_index < 0) {
+            // FIXME: move this logic to StringObject::char_index_to_byte_index
             size_t byte_index = str_obj->bytesize();
             ssize_t char_index = 0;
             TM::StringView view;
@@ -412,6 +413,7 @@ Value RegexpObject::match(Env *env, Value other, Value start, Block *block) {
             } while (byte_index != 0 && start_byte_index < char_index);
             start_byte_index = byte_index;
         } else {
+            // FIXME: use StringObject::char_index_to_byte_index
             size_t byte_index = 0;
             ssize_t char_index = 0;
             TM::StringView view;
diff --git a/src/string_object.cpp b/src/string_object.cpp
index b4f80e65a..37bfc3697 100644
--- a/src/string_object.cpp
+++ b/src/string_object.cpp
@@ -635,7 +635,7 @@ bool StringObject::end_with(Env *env, Args args) const {
     return false;
 }
 
-Value StringObject::index(Env *env, Value needle, Value offset) const {
+Value StringObject::index(Env *env, Value needle, Value offset) {
     int offset_i = (offset) ? IntegerObject::convert_to_int(env, offset) : 0;
     int len = char_count(env);
     if (offset_i < -1 * len) {
@@ -648,29 +648,47 @@ Value StringObject::index(Env *env, Value needle, Value offset) const {
     return index(env, needle, ::abs(offset_i));
 }
 
-// TODO: Handle Regexp needle case
-Value StringObject::index(Env *env, Value needle, size_t start) const {
-    auto byte_index = index_int(env, needle, start);
-    if (byte_index == -1) {
+Value StringObject::index(Env *env, Value needle, size_t start) {
+    auto byte_start = char_index_to_byte_index(start);
+    if (byte_start == -1)
         return NilObject::the();
-    }
-    size_t byte_index_size_t = static_cast<size_t>(byte_index);
-    size_t char_index = 0, index = 0;
-    auto c = next_char(&index);
-    while (!c.is_empty()) {
-        if (index > byte_index_size_t)
-            return IntegerObject::from_size_t(env, char_index);
-        char_index++;
-        c = next_char(&index);
-    }
-    return Value::integer(0);
+    auto byte_index = index_int(env, needle, byte_start);
+    if (byte_index == -1)
+        return NilObject::the();
+    auto char_index = byte_index_to_char_index(byte_index);
+    return IntegerObject::from_size_t(env, char_index);
 }
 
-nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) const {
+nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) {
+    if (needle->is_regexp()) {
+        if (needle->as_regexp()->pattern().is_empty())
+            return byte_start;
+
+        if (bytesize() == 0)
+            return -1;
+
+        if (byte_start >= bytesize())
+            return -1;
+
+        OnigRegion *region = onig_region_new();
+        int result = needle->as_regexp()->search(string(), byte_start, region, ONIG_OPTION_NONE);
+        if (result == ONIG_MISMATCH) {
+            // Nothing adopts the region on a mismatch, so free it here to avoid a leak.
+            onig_region_free(region, true);
+            env->caller()->set_last_match(nullptr);
+            return -1;
+        }
+
+        auto match = new MatchDataObject { region, this, needle->as_regexp() };
+        env->caller()->set_last_match(match);
+        return region->beg[0];
+    }
+
     auto needle_str = needle->to_str(env)->as_string();
+    assert_compatible_string(env, needle_str);
 
     if (needle_str->bytesize() == 0)
-        return 0;
+        return byte_start;
 
     if (bytesize() == 0)
         return -1;
@@ -1847,7 +1865,7 @@ size_t StringObject::byte_index_to_char_index(ArrayObject *chars, size_t byte_in
     return char_index;
 }
 
-size_t StringObject::char_index_to_byte_index(size_t char_index) const {
+ssize_t StringObject::char_index_to_byte_index(size_t char_index) const {
     if (m_encoding->is_single_byte_encoding())
         return char_index;
 
@@ -1857,12 +1875,12 @@ size_t StringObject::char_index_to_byte_index(size_t char_index) const {
     while (char_index > current_char_index) {
         view = next_char(&current_byte_index);
         current_char_index++;
-        if (view.is_empty()) break;
+        if (view.is_empty()) return -1;
     }
     return current_byte_index;
 }
 
-size_t StringObject::byte_index_to_char_index(size_t byte_index) const {
+ssize_t StringObject::byte_index_to_char_index(size_t byte_index) const {
     if (m_encoding->is_single_byte_encoding())
         return byte_index;
 
@@ -1872,7 +1890,7 @@ size_t StringObject::byte_index_to_char_index(size_t byte_index) const {
     while (byte_index > current_byte_index) {
         view = next_char(&current_byte_index);
         current_char_index++;
-        if (view.is_empty()) break;
+        if (view.is_empty()) return -1;
     }
     return current_char_index;
 }