From 973d312d669aafc7398997457af1faca1df3ea22 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 29 Jun 2024 11:35:23 -0500 Subject: [PATCH 1/3] Refactor String::byteindex to separate the string needle logic This will make it easier to add a regexp case. --- include/natalie/string_object.hpp | 4 +-- spec/core/string/byteindex_spec.rb | 4 +-- src/string_object.cpp | 42 +++++++++++++++++------------- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/include/natalie/string_object.hpp b/include/natalie/string_object.hpp index 72331e648..d7c4c494a 100644 --- a/include/natalie/string_object.hpp +++ b/include/natalie/string_object.hpp @@ -168,8 +168,8 @@ class StringObject : public Object { EncodingObject *encoding() const { return m_encoding.ptr(); } void set_encoding(EncodingObject *encoding) { m_encoding = encoding; } bool is_ascii_only() const; - EncodingObject *negotiate_compatible_encoding(StringObject *) const; - void assert_compatible_string(Env *, StringObject *) const; + EncodingObject *negotiate_compatible_encoding(const StringObject *) const; + void assert_compatible_string(Env *, const StringObject *) const; void assert_valid_encoding(Env *) const; EncodingObject *assert_compatible_string_and_update_encoding(Env *, StringObject *); diff --git a/spec/core/string/byteindex_spec.rb b/spec/core/string/byteindex_spec.rb index 097699104..2b64e2aad 100644 --- a/spec/core/string/byteindex_spec.rb +++ b/spec/core/string/byteindex_spec.rb @@ -268,9 +268,7 @@ end it "returns nil if the Regexp matches the empty string and the offset is out of range" do - NATFIXME 'Support Regexp', exception: TypeError do - "ruby".byteindex(//, 12).should be_nil - end + "ruby".byteindex(//, 12).should be_nil end it "supports \\G which matches at the given start offset" do diff --git a/src/string_object.cpp b/src/string_object.cpp index 20f9fbbf4..27002d1da 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -667,22 +667,15 @@ bool StringObject::end_with(Env *env, Args args) const { return false; } -Value StringObject::byteindex(Env *env, Value needle_obj, Value offset_obj) const { - StringObject *needle_str = needle_obj->to_str2(env); - String needle = needle_str->string(); - - assert_compatible_string(env, needle_str); +static Value byteindex_string_needle(Env *env, const StringObject *haystack, const StringObject *needle_obj, size_t offset) { + haystack->assert_compatible_string(env, needle_obj); + String needle = needle_obj->string(); - ssize_t offset = 0; - if (offset_obj) - offset = IntegerObject::convert_to_native_type(env, offset_obj); - if (offset < 0) - offset += bytesize(); - if (offset < 0 || (size_t)offset + needle.size() > bytesize()) + if ((size_t)offset + needle.size() > haystack->bytesize()) return NilObject::the(); - if ((size_t)offset < bytesize()) { - auto character_check = new StringObject { m_string.substring(offset, std::min(bytesize() - offset, (size_t)4)) }; + if ((size_t)offset < haystack->bytesize()) { + auto character_check = new StringObject { haystack->string().substring(offset, std::min(haystack->bytesize() - offset, (size_t)4)) }; size_t ignored = 0; auto [valid, _char] = character_check->next_char_result(&ignored); if (!valid) @@ -692,17 +685,30 @@ Value StringObject::byteindex(Env *env, Value needle_obj, Value offset_obj) cons if (needle.is_empty()) return Value::integer(offset); - if ((size_t)offset >= bytesize()) + if ((size_t)offset >= haystack->bytesize()) return NilObject::the(); - auto pointer = memmem(c_str() + offset, bytesize() - offset, needle.c_str(), needle.size()); + auto pointer = memmem(haystack->c_str() + offset, haystack->bytesize() - offset, needle.c_str(), needle.size()); if (!pointer) return NilObject::the(); - auto result = (const char *)pointer - c_str(); + auto result = (const char *)pointer - haystack->c_str(); return Value::integer(result); } +Value StringObject::byteindex(Env *env, Value needle_obj, Value offset_obj) const { + ssize_t offset = 0; + if (offset_obj) + offset = IntegerObject::convert_to_native_type(env, offset_obj); + if (offset < 0) + offset += bytesize(); + if (offset < 0 || (size_t)offset > bytesize()) + return NilObject::the(); + + auto needle = needle_obj->to_str2(env); + return byteindex_string_needle(env, this, needle, offset); +} + Value StringObject::index(Env *env, Value needle, Value offset) { int offset_i = (offset) ? IntegerObject::convert_to_int(env, offset) : 0; int len = char_count(env); @@ -3320,7 +3326,7 @@ bool StringObject::is_ascii_only() const { return true; } -EncodingObject *StringObject::negotiate_compatible_encoding(StringObject *other_string) const { +EncodingObject *StringObject::negotiate_compatible_encoding(const StringObject *other_string) const { if (m_encoding == other_string->m_encoding) return m_encoding.ptr(); @@ -3343,7 +3349,7 @@ EncodingObject *StringObject::negotiate_compatible_encoding(StringObject *other_ return m_encoding.ptr(); } -void StringObject::assert_compatible_string(Env *env, StringObject *other_string) const { +void StringObject::assert_compatible_string(Env *env, const StringObject *other_string) const { auto compatible_encoding = negotiate_compatible_encoding(other_string); if (compatible_encoding) return; From 5f7197530866431fdaf82bc404278a9577f1a29c Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 29 Jun 2024 12:55:25 -0500 Subject: [PATCH 2/3] Copy string when building MatchDataObject Given a script like this: s = "foo" r = /foo/s.byteindex(r) puts " s.object_id = #{s.object_id}" puts "$~.string.object_id = #{$~.string.object_id}" puts " r.object_id = #{r.object_id}" puts "$~.regexp.object_id = #{$~.regexp.object_id}" The results look like this: s.object_id = 60 $~.string.object_id = 80 r.object_id = 100 $~.regexp.object_id = 100 So that means Ruby copies the string. I don't know what happens if the string is frozen, but we can punt that until later. --- include/natalie/match_data_object.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/natalie/match_data_object.hpp b/include/natalie/match_data_object.hpp index 784b16648..181152936 100644 --- a/include/natalie/match_data_object.hpp +++ b/include/natalie/match_data_object.hpp @@ -20,10 +20,10 @@ class MatchDataObject : public Object { MatchDataObject(ClassObject *klass) : Object { Object::Type::MatchData, klass } { } - MatchDataObject(OnigRegion *region, StringObject *string, RegexpObject *regexp) + MatchDataObject(OnigRegion *region, const StringObject *string, RegexpObject *regexp) : Object { Object::Type::MatchData, GlobalEnv::the()->Object()->const_fetch("MatchData"_s)->as_class() } , m_region { region } - , m_string { string } + , m_string { new StringObject(*string) } , m_regexp { regexp } { } virtual ~MatchDataObject() override { From 61440c51dfa031a48b0c8ab6e04abcea07ee8b68 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 29 Jun 2024 13:09:29 -0500 Subject: [PATCH 3/3] Add support for Regexp argument to String#byteindex --- include/natalie/encoding_object.hpp | 3 +- spec/core/string/byteindex_spec.rb | 178 +++++++++---------- spec/core/string/shared/byte_index_common.rb | 20 +-- src/encoding_object.cpp | 5 + src/regexp_object.cpp | 1 + src/string_object.cpp | 33 +++- 6 files changed, 126 insertions(+), 114 deletions(-) diff --git a/include/natalie/encoding_object.hpp b/include/natalie/encoding_object.hpp index 6e988af39..1692491d5 100644 --- a/include/natalie/encoding_object.hpp +++ b/include/natalie/encoding_object.hpp @@ -125,7 +125,8 @@ class EncodingObject : public Object { virtual bool is_compatible_with(EncodingObject *) const; - [[noreturn]] void raise_encoding_invalid_byte_sequence_error(Env *env, const String &, size_t) const; + [[noreturn]] void raise_encoding_invalid_byte_sequence_error(Env *, const String &, size_t) const; + [[noreturn]] void raise_compatibility_error(Env *, const EncodingObject *) const; static HashObject *aliases(Env *); static Value find(Env *, Value); diff --git a/spec/core/string/byteindex_spec.rb b/spec/core/string/byteindex_spec.rb index 2b64e2aad..47c7be102 100644 --- a/spec/core/string/byteindex_spec.rb +++ b/spec/core/string/byteindex_spec.rb @@ -168,81 +168,75 @@ describe "String#byteindex with Regexp" do ruby_version_is "3.2" do it "behaves the same as String#byteindex(string) for escaped string regexps" do - NATFIXME 'Support Regexp', exception: TypeError do - ["blablabla", "hello cruel world...!"].each do |str| - ["", "b", "bla", "lab", "o c", "d."].each do |needle| - regexp = Regexp.new(Regexp.escape(needle)) - str.byteindex(regexp).should == str.byteindex(needle) - - 0.upto(str.size + 1) do |start| - str.byteindex(regexp, start).should == str.byteindex(needle, start) - end - - (-str.size - 1).upto(-1) do |start| - str.byteindex(regexp, start).should == str.byteindex(needle, start) - end + ["blablabla", "hello cruel world...!"].each do |str| + ["", "b", "bla", "lab", "o c", "d."].each do |needle| + regexp = Regexp.new(Regexp.escape(needle)) + str.byteindex(regexp).should == str.byteindex(needle) + + 0.upto(str.size + 1) do |start| + str.byteindex(regexp, start).should == str.byteindex(needle, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byteindex(regexp, start).should == str.byteindex(needle, start) end end end end it "returns the byteindex of the first match of regexp" do - NATFIXME 'Support Regexp', exception: TypeError do - "blablabla".byteindex(/bla/).should == 0 - "blablabla".byteindex(/BLA/i).should == 0 + "blablabla".byteindex(/bla/).should == 0 + "blablabla".byteindex(/BLA/i).should == 0 - "blablabla".byteindex(/.{0}/).should == 0 - "blablabla".byteindex(/.{6}/).should == 0 - "blablabla".byteindex(/.{9}/).should == 0 + "blablabla".byteindex(/.{0}/).should == 0 + "blablabla".byteindex(/.{6}/).should == 0 + "blablabla".byteindex(/.{9}/).should == 0 - "blablabla".byteindex(/.*/).should == 0 - "blablabla".byteindex(/.+/).should == 0 + "blablabla".byteindex(/.*/).should == 0 + "blablabla".byteindex(/.+/).should == 0 - "blablabla".byteindex(/lab|b/).should == 0 + "blablabla".byteindex(/lab|b/).should == 0 - not_supported_on :opal do - "blablabla".byteindex(/\A/).should == 0 - "blablabla".byteindex(/\Z/).should == 9 - "blablabla".byteindex(/\z/).should == 9 - "blablabla\n".byteindex(/\Z/).should == 9 - "blablabla\n".byteindex(/\z/).should == 10 - end + not_supported_on :opal do + "blablabla".byteindex(/\A/).should == 0 + "blablabla".byteindex(/\Z/).should == 9 + "blablabla".byteindex(/\z/).should == 9 + "blablabla\n".byteindex(/\Z/).should == 9 + "blablabla\n".byteindex(/\z/).should == 10 + end - "blablabla".byteindex(/^/).should == 0 - "\nblablabla".byteindex(/^/).should == 0 - "b\nablabla".byteindex(/$/).should == 1 - "bl\nablabla".byteindex(/$/).should == 2 + "blablabla".byteindex(/^/).should == 0 + "\nblablabla".byteindex(/^/).should == 0 + "b\nablabla".byteindex(/$/).should == 1 + "bl\nablabla".byteindex(/$/).should == 2 - "blablabla".byteindex(/.l./).should == 0 - end + "blablabla".byteindex(/.l./).should == 0 end it "starts the search at the given offset" do - NATFIXME 'Support Regexp', exception: TypeError do - "blablabla".byteindex(/.{0}/, 5).should == 5 - "blablabla".byteindex(/.{1}/, 5).should == 5 - "blablabla".byteindex(/.{2}/, 5).should == 5 - "blablabla".byteindex(/.{3}/, 5).should == 5 - "blablabla".byteindex(/.{4}/, 5).should == 5 - - "blablabla".byteindex(/.{0}/, 3).should == 3 - "blablabla".byteindex(/.{1}/, 3).should == 3 - "blablabla".byteindex(/.{2}/, 3).should == 3 - "blablabla".byteindex(/.{5}/, 3).should == 3 - "blablabla".byteindex(/.{6}/, 3).should == 3 - - "blablabla".byteindex(/.l./, 0).should == 0 - "blablabla".byteindex(/.l./, 1).should == 3 - "blablabla".byteindex(/.l./, 2).should == 3 - "blablabla".byteindex(/.l./, 3).should == 3 - - "xblaxbla".byteindex(/x./, 0).should == 0 - "xblaxbla".byteindex(/x./, 1).should == 4 - "xblaxbla".byteindex(/x./, 2).should == 4 - - not_supported_on :opal do - "blablabla\n".byteindex(/\Z/, 9).should == 9 - end + "blablabla".byteindex(/.{0}/, 5).should == 5 + "blablabla".byteindex(/.{1}/, 5).should == 5 + "blablabla".byteindex(/.{2}/, 5).should == 5 + "blablabla".byteindex(/.{3}/, 5).should == 5 + "blablabla".byteindex(/.{4}/, 5).should == 5 + + "blablabla".byteindex(/.{0}/, 3).should == 3 + "blablabla".byteindex(/.{1}/, 3).should == 3 + "blablabla".byteindex(/.{2}/, 3).should == 3 + "blablabla".byteindex(/.{5}/, 3).should == 3 + "blablabla".byteindex(/.{6}/, 3).should == 3 + + "blablabla".byteindex(/.l./, 0).should == 0 + "blablabla".byteindex(/.l./, 1).should == 3 + "blablabla".byteindex(/.l./, 2).should == 3 + "blablabla".byteindex(/.l./, 3).should == 3 + + "xblaxbla".byteindex(/x./, 0).should == 0 + "xblaxbla".byteindex(/x./, 1).should == 4 + "xblaxbla".byteindex(/x./, 2).should == 4 + + not_supported_on :opal do + "blablabla\n".byteindex(/\Z/, 9).should == 9 end end @@ -258,13 +252,11 @@ end it "returns nil if the substring isn't found" do - NATFIXME 'Support Regexp', exception: TypeError do - "blablabla".byteindex(/BLA/).should == nil + "blablabla".byteindex(/BLA/).should == nil - "blablabla".byteindex(/.{10}/).should == nil - "blaxbla".byteindex(/.x/, 3).should == nil - "blaxbla".byteindex(/..x/, 2).should == nil - end + "blablabla".byteindex(/.{10}/).should == nil + "blaxbla".byteindex(/.x/, 3).should == nil + "blaxbla".byteindex(/..x/, 2).should == nil end it "returns nil if the Regexp matches the empty string and the offset is out of range" do @@ -272,51 +264,41 @@ end it "supports \\G which matches at the given start offset" do - NATFIXME 'Support Regexp', exception: TypeError do - "helloYOU.".byteindex(/\GYOU/, 5).should == 5 - "helloYOU.".byteindex(/\GYOU/).should == nil - - re = /\G.+YOU/ - # The # marks where \G will match. - [ - ["#hi!YOUall.", 0], - ["h#i!YOUall.", 1], - ["hi#!YOUall.", 2], - ["hi!#YOUall.", nil] - ].each do |spec| - - start = spec[0].byteindex("#") - str = spec[0].delete("#") - - str.byteindex(re, start).should == spec[1] - end + "helloYOU.".byteindex(/\GYOU/, 5).should == 5 + "helloYOU.".byteindex(/\GYOU/).should == nil + + re = /\G.+YOU/ + # The # marks where \G will match. + [ + ["#hi!YOUall.", 0], + ["h#i!YOUall.", 1], + ["hi#!YOUall.", 2], + ["hi!#YOUall.", nil] + ].each do |spec| + + start = spec[0].byteindex("#") + str = spec[0].delete("#") + + str.byteindex(re, start).should == spec[1] end end it "converts start_offset to an integer via to_int" do - NATFIXME 'Support Regexp', exception: TypeError do - obj = mock('1') - obj.should_receive(:to_int).and_return(1) - "RWOARW".byteindex(/R./, obj).should == 4 - end + obj = mock('1') + obj.should_receive(:to_int).and_return(1) + "RWOARW".byteindex(/R./, obj).should == 4 end it "returns the character byteindex of a multibyte character" do - NATFIXME 'Support Regexp', exception: TypeError do - "ありがとう".byteindex(/が/).should == 6 - end + "ありがとう".byteindex(/が/).should == 6 end it "returns the character byteindex after offset" do - NATFIXME 'Support Regexp', exception: TypeError do - "われわれ".byteindex(/わ/, 3).should == 6 - end + "われわれ".byteindex(/わ/, 3).should == 6 end it "treats the offset as a byteindex" do - NATFIXME 'Support Regexp', exception: TypeError do - "われわわれ".byteindex(/わ/, 6).should == 6 - end + "われわわれ".byteindex(/わ/, 6).should == 6 end end end diff --git a/spec/core/string/shared/byte_index_common.rb b/spec/core/string/shared/byte_index_common.rb index 67c27dc1a..3de1453f4 100644 --- a/spec/core/string/shared/byte_index_common.rb +++ b/spec/core/string/shared/byte_index_common.rb @@ -37,12 +37,10 @@ end it "raises an Encoding::CompatibilityError if the encodings are incompatible" do - NATFIXME 'Support Regexp', exception: SpecFailedException, message: /should have raised Encoding::CompatibilityError/ do - re = Regexp.new "れ".encode(Encoding::EUC_JP) - -> do - "あれ".send(@method, re) - end.should raise_error(Encoding::CompatibilityError, "incompatible encoding regexp match (EUC-JP regexp with UTF-8 string)") - end + re = Regexp.new "れ".encode(Encoding::EUC_JP) + -> do + "あれ".send(@method, re) + end.should raise_error(Encoding::CompatibilityError, "incompatible encoding regexp match (EUC-JP regexp with UTF-8 string)") end end @@ -55,13 +53,11 @@ end it "sets $~ to MatchData of match and nil when there's none" do - NATFIXME 'Support Regexp', exception: TypeError do - 'hello.'.send(@method, /.e./) - $~[0].should == 'hel' + 'hello.'.send(@method, /.e./) + $~[0].should == 'hel' - 'hello.'.send(@method, /not/) - $~.should == nil - end + 'hello.'.send(@method, /not/) + $~.should == nil end end end diff --git a/src/encoding_object.cpp b/src/encoding_object.cpp index 62f920827..ab28e7c50 100644 --- a/src/encoding_object.cpp +++ b/src/encoding_object.cpp @@ -368,6 +368,11 @@ void EncodingObject::raise_encoding_invalid_byte_sequence_error(Env *env, const env->raise(InvalidByteSequenceError, message); } +void EncodingObject::raise_compatibility_error(Env *env, const EncodingObject *other_encoding) const { + auto exception_class = fetch_nested_const({ "Encoding"_s, "CompatibilityError"_s })->as_class(); + env->raise(exception_class, "incompatible character encodings: {} and {}", name()->string(), other_encoding->name()->string()); +} + Value EncodingObject::inspect(Env *env) const { if (is_dummy()) return StringObject::format("#", name()); diff --git a/src/regexp_object.cpp b/src/regexp_object.cpp index 9ce8a9251..0d2672c3e 100644 --- a/src/regexp_object.cpp +++ b/src/regexp_object.cpp @@ -545,6 +545,7 @@ int RegexpObject::search(Env *env, const StringObject *string_obj, int start, On const unsigned char *char_start = unsigned_str + start; const unsigned char *char_range = char_end; + // FIXME: check if it's already FIXEDENCODING if (string_obj->encoding() != encoding()) { RegexpObject temp_regexp; temp_regexp.initialize_internal(env, m_pattern, m_options | RegexOpts::FixedEncoding); diff --git a/src/string_object.cpp b/src/string_object.cpp index 27002d1da..0e1b68c1e 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -667,7 +667,31 @@ bool StringObject::end_with(Env *env, Args args) const { return false; } -static Value byteindex_string_needle(Env *env, const StringObject *haystack, const StringObject *needle_obj, size_t offset) { +static Value byteindex_regexp_needle(Env *env, const StringObject *haystack, RegexpObject *needle, size_t offset) { + if (!haystack->negotiate_compatible_encoding(needle->pattern())) { + auto exception_class = fetch_nested_const({ "Encoding"_s, "CompatibilityError"_s })->as_class(); + auto enc1 = needle->pattern()->encoding()->name()->string(); + auto enc2 = haystack->encoding()->name()->string(); + env->raise(exception_class, "incompatible encoding regexp match ({} regexp with {} string)", enc1, enc2); + } + + if (needle->pattern()->is_empty()) + return Value::integer(offset); + + OnigRegion *region = onig_region_new(); + int result = needle->search(env, haystack, offset, region, ONIG_OPTION_NONE); + if (result == ONIG_MISMATCH) { + env->caller()->set_last_match(nullptr); + return NilObject::the(); + } + + auto match = new MatchDataObject { region, haystack, needle }; + env->caller()->set_last_match(match); + auto byte_index = region->beg[0]; + return Value::integer(byte_index); +} + +static Value byteindex_string_needle(Env *env, const StringObject *haystack, StringObject *needle_obj, size_t offset) { haystack->assert_compatible_string(env, needle_obj); String needle = needle_obj->string(); @@ -705,6 +729,9 @@ Value StringObject::byteindex(Env *env, Value needle_obj, Value offset_obj) cons if (offset < 0 || (size_t)offset > bytesize()) return NilObject::the(); + if (needle_obj->is_regexp()) + return byteindex_regexp_needle(env, this, needle_obj->as_regexp(), offset); + auto needle = needle_obj->to_str2(env); return byteindex_string_needle(env, this, needle, offset); } @@ -735,6 +762,7 @@ Value StringObject::index(Env *env, Value needle, size_t start) { nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) { if (needle->is_regexp()) { + // FIXME: use byteindex_regexp_needle shared code if (needle->as_regexp()->pattern()->is_empty()) return byte_start; @@ -3354,8 +3382,7 @@ void StringObject::assert_compatible_string(Env *env, const StringObject *other_ if (compatible_encoding) return; - auto exception_class = fetch_nested_const({ "Encoding"_s, "CompatibilityError"_s })->as_class(); - env->raise(exception_class, "incompatible character encodings: {} and {}", m_encoding->name()->string(), other_string->m_encoding->name()->string()); + m_encoding->raise_compatibility_error(env, other_string->m_encoding.ptr()); } void StringObject::assert_valid_encoding(Env *env) const {