Skip to content

Commit

Permalink
Merge pull request #2182 from natalie-lang/string-byteindex-regexp
Browse files Browse the repository at this point in the history
Add support for Regexp argument to String#byteindex
  • Loading branch information
seven1m authored Jul 4, 2024
2 parents eb472ee + 61440c5 commit 63cec98
Show file tree
Hide file tree
Showing 8 changed files with 153 additions and 137 deletions.
3 changes: 2 additions & 1 deletion include/natalie/encoding_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ class EncodingObject : public Object {

virtual bool is_compatible_with(EncodingObject *) const;

[[noreturn]] void raise_encoding_invalid_byte_sequence_error(Env *env, const String &, size_t) const;
[[noreturn]] void raise_encoding_invalid_byte_sequence_error(Env *, const String &, size_t) const;
[[noreturn]] void raise_compatibility_error(Env *, const EncodingObject *) const;

static HashObject *aliases(Env *);
static Value find(Env *, Value);
Expand Down
4 changes: 2 additions & 2 deletions include/natalie/match_data_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ class MatchDataObject : public Object {
MatchDataObject(ClassObject *klass)
: Object { Object::Type::MatchData, klass } { }

MatchDataObject(OnigRegion *region, StringObject *string, RegexpObject *regexp)
MatchDataObject(OnigRegion *region, const StringObject *string, RegexpObject *regexp)
: Object { Object::Type::MatchData, GlobalEnv::the()->Object()->const_fetch("MatchData"_s)->as_class() }
, m_region { region }
, m_string { string }
, m_string { new StringObject(*string) }
, m_regexp { regexp } { }

virtual ~MatchDataObject() override {
Expand Down
4 changes: 2 additions & 2 deletions include/natalie/string_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ class StringObject : public Object {
EncodingObject *encoding() const { return m_encoding.ptr(); }
void set_encoding(EncodingObject *encoding) { m_encoding = encoding; }
bool is_ascii_only() const;
EncodingObject *negotiate_compatible_encoding(StringObject *) const;
void assert_compatible_string(Env *, StringObject *) const;
EncodingObject *negotiate_compatible_encoding(const StringObject *) const;
void assert_compatible_string(Env *, const StringObject *) const;
void assert_valid_encoding(Env *) const;
EncodingObject *assert_compatible_string_and_update_encoding(Env *, StringObject *);

Expand Down
182 changes: 81 additions & 101 deletions spec/core/string/byteindex_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -168,81 +168,75 @@
describe "String#byteindex with Regexp" do
ruby_version_is "3.2" do
it "behaves the same as String#byteindex(string) for escaped string regexps" do
NATFIXME 'Support Regexp', exception: TypeError do
["blablabla", "hello cruel world...!"].each do |str|
["", "b", "bla", "lab", "o c", "d."].each do |needle|
regexp = Regexp.new(Regexp.escape(needle))
str.byteindex(regexp).should == str.byteindex(needle)

0.upto(str.size + 1) do |start|
str.byteindex(regexp, start).should == str.byteindex(needle, start)
end

(-str.size - 1).upto(-1) do |start|
str.byteindex(regexp, start).should == str.byteindex(needle, start)
end
["blablabla", "hello cruel world...!"].each do |str|
["", "b", "bla", "lab", "o c", "d."].each do |needle|
regexp = Regexp.new(Regexp.escape(needle))
str.byteindex(regexp).should == str.byteindex(needle)

0.upto(str.size + 1) do |start|
str.byteindex(regexp, start).should == str.byteindex(needle, start)
end

(-str.size - 1).upto(-1) do |start|
str.byteindex(regexp, start).should == str.byteindex(needle, start)
end
end
end
end

it "returns the byteindex of the first match of regexp" do
NATFIXME 'Support Regexp', exception: TypeError do
"blablabla".byteindex(/bla/).should == 0
"blablabla".byteindex(/BLA/i).should == 0
"blablabla".byteindex(/bla/).should == 0
"blablabla".byteindex(/BLA/i).should == 0

"blablabla".byteindex(/.{0}/).should == 0
"blablabla".byteindex(/.{6}/).should == 0
"blablabla".byteindex(/.{9}/).should == 0
"blablabla".byteindex(/.{0}/).should == 0
"blablabla".byteindex(/.{6}/).should == 0
"blablabla".byteindex(/.{9}/).should == 0

"blablabla".byteindex(/.*/).should == 0
"blablabla".byteindex(/.+/).should == 0
"blablabla".byteindex(/.*/).should == 0
"blablabla".byteindex(/.+/).should == 0

"blablabla".byteindex(/lab|b/).should == 0
"blablabla".byteindex(/lab|b/).should == 0

not_supported_on :opal do
"blablabla".byteindex(/\A/).should == 0
"blablabla".byteindex(/\Z/).should == 9
"blablabla".byteindex(/\z/).should == 9
"blablabla\n".byteindex(/\Z/).should == 9
"blablabla\n".byteindex(/\z/).should == 10
end
not_supported_on :opal do
"blablabla".byteindex(/\A/).should == 0
"blablabla".byteindex(/\Z/).should == 9
"blablabla".byteindex(/\z/).should == 9
"blablabla\n".byteindex(/\Z/).should == 9
"blablabla\n".byteindex(/\z/).should == 10
end

"blablabla".byteindex(/^/).should == 0
"\nblablabla".byteindex(/^/).should == 0
"b\nablabla".byteindex(/$/).should == 1
"bl\nablabla".byteindex(/$/).should == 2
"blablabla".byteindex(/^/).should == 0
"\nblablabla".byteindex(/^/).should == 0
"b\nablabla".byteindex(/$/).should == 1
"bl\nablabla".byteindex(/$/).should == 2

"blablabla".byteindex(/.l./).should == 0
end
"blablabla".byteindex(/.l./).should == 0
end

it "starts the search at the given offset" do
NATFIXME 'Support Regexp', exception: TypeError do
"blablabla".byteindex(/.{0}/, 5).should == 5
"blablabla".byteindex(/.{1}/, 5).should == 5
"blablabla".byteindex(/.{2}/, 5).should == 5
"blablabla".byteindex(/.{3}/, 5).should == 5
"blablabla".byteindex(/.{4}/, 5).should == 5

"blablabla".byteindex(/.{0}/, 3).should == 3
"blablabla".byteindex(/.{1}/, 3).should == 3
"blablabla".byteindex(/.{2}/, 3).should == 3
"blablabla".byteindex(/.{5}/, 3).should == 3
"blablabla".byteindex(/.{6}/, 3).should == 3

"blablabla".byteindex(/.l./, 0).should == 0
"blablabla".byteindex(/.l./, 1).should == 3
"blablabla".byteindex(/.l./, 2).should == 3
"blablabla".byteindex(/.l./, 3).should == 3

"xblaxbla".byteindex(/x./, 0).should == 0
"xblaxbla".byteindex(/x./, 1).should == 4
"xblaxbla".byteindex(/x./, 2).should == 4

not_supported_on :opal do
"blablabla\n".byteindex(/\Z/, 9).should == 9
end
"blablabla".byteindex(/.{0}/, 5).should == 5
"blablabla".byteindex(/.{1}/, 5).should == 5
"blablabla".byteindex(/.{2}/, 5).should == 5
"blablabla".byteindex(/.{3}/, 5).should == 5
"blablabla".byteindex(/.{4}/, 5).should == 5

"blablabla".byteindex(/.{0}/, 3).should == 3
"blablabla".byteindex(/.{1}/, 3).should == 3
"blablabla".byteindex(/.{2}/, 3).should == 3
"blablabla".byteindex(/.{5}/, 3).should == 3
"blablabla".byteindex(/.{6}/, 3).should == 3

"blablabla".byteindex(/.l./, 0).should == 0
"blablabla".byteindex(/.l./, 1).should == 3
"blablabla".byteindex(/.l./, 2).should == 3
"blablabla".byteindex(/.l./, 3).should == 3

"xblaxbla".byteindex(/x./, 0).should == 0
"xblaxbla".byteindex(/x./, 1).should == 4
"xblaxbla".byteindex(/x./, 2).should == 4

not_supported_on :opal do
"blablabla\n".byteindex(/\Z/, 9).should == 9
end
end

Expand All @@ -258,67 +252,53 @@
end

it "returns nil if the substring isn't found" do
NATFIXME 'Support Regexp', exception: TypeError do
"blablabla".byteindex(/BLA/).should == nil
"blablabla".byteindex(/BLA/).should == nil

"blablabla".byteindex(/.{10}/).should == nil
"blaxbla".byteindex(/.x/, 3).should == nil
"blaxbla".byteindex(/..x/, 2).should == nil
end
"blablabla".byteindex(/.{10}/).should == nil
"blaxbla".byteindex(/.x/, 3).should == nil
"blaxbla".byteindex(/..x/, 2).should == nil
end

it "returns nil if the Regexp matches the empty string and the offset is out of range" do
NATFIXME 'Support Regexp', exception: TypeError do
"ruby".byteindex(//, 12).should be_nil
end
"ruby".byteindex(//, 12).should be_nil
end

it "supports \\G which matches at the given start offset" do
NATFIXME 'Support Regexp', exception: TypeError do
"helloYOU.".byteindex(/\GYOU/, 5).should == 5
"helloYOU.".byteindex(/\GYOU/).should == nil

re = /\G.+YOU/
# The # marks where \G will match.
[
["#hi!YOUall.", 0],
["h#i!YOUall.", 1],
["hi#!YOUall.", 2],
["hi!#YOUall.", nil]
].each do |spec|

start = spec[0].byteindex("#")
str = spec[0].delete("#")

str.byteindex(re, start).should == spec[1]
end
"helloYOU.".byteindex(/\GYOU/, 5).should == 5
"helloYOU.".byteindex(/\GYOU/).should == nil

re = /\G.+YOU/
# The # marks where \G will match.
[
["#hi!YOUall.", 0],
["h#i!YOUall.", 1],
["hi#!YOUall.", 2],
["hi!#YOUall.", nil]
].each do |spec|

start = spec[0].byteindex("#")
str = spec[0].delete("#")

str.byteindex(re, start).should == spec[1]
end
end

it "converts start_offset to an integer via to_int" do
NATFIXME 'Support Regexp', exception: TypeError do
obj = mock('1')
obj.should_receive(:to_int).and_return(1)
"RWOARW".byteindex(/R./, obj).should == 4
end
obj = mock('1')
obj.should_receive(:to_int).and_return(1)
"RWOARW".byteindex(/R./, obj).should == 4
end

it "returns the character byteindex of a multibyte character" do
NATFIXME 'Support Regexp', exception: TypeError do
"ありがとう".byteindex(/が/).should == 6
end
"ありがとう".byteindex(/が/).should == 6
end

it "returns the character byteindex after offset" do
NATFIXME 'Support Regexp', exception: TypeError do
"われわれ".byteindex(/わ/, 3).should == 6
end
"われわれ".byteindex(/わ/, 3).should == 6
end

it "treats the offset as a byteindex" do
NATFIXME 'Support Regexp', exception: TypeError do
"われわわれ".byteindex(/わ/, 6).should == 6
end
"われわわれ".byteindex(/わ/, 6).should == 6
end
end
end
20 changes: 8 additions & 12 deletions spec/core/string/shared/byte_index_common.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,10 @@
end

it "raises an Encoding::CompatibilityError if the encodings are incompatible" do
NATFIXME 'Support Regexp', exception: SpecFailedException, message: /should have raised Encoding::CompatibilityError/ do
re = Regexp.new "れ".encode(Encoding::EUC_JP)
-> do
"あれ".send(@method, re)
end.should raise_error(Encoding::CompatibilityError, "incompatible encoding regexp match (EUC-JP regexp with UTF-8 string)")
end
re = Regexp.new "れ".encode(Encoding::EUC_JP)
-> do
"あれ".send(@method, re)
end.should raise_error(Encoding::CompatibilityError, "incompatible encoding regexp match (EUC-JP regexp with UTF-8 string)")
end
end

Expand All @@ -55,13 +53,11 @@
end

it "sets $~ to MatchData of match and nil when there's none" do
NATFIXME 'Support Regexp', exception: TypeError do
'hello.'.send(@method, /.e./)
$~[0].should == 'hel'
'hello.'.send(@method, /.e./)
$~[0].should == 'hel'

'hello.'.send(@method, /not/)
$~.should == nil
end
'hello.'.send(@method, /not/)
$~.should == nil
end
end
end
5 changes: 5 additions & 0 deletions src/encoding_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,11 @@ void EncodingObject::raise_encoding_invalid_byte_sequence_error(Env *env, const
env->raise(InvalidByteSequenceError, message);
}

// Raises Encoding::CompatibilityError with a message naming this encoding
// and `other_encoding`, in that order.
void EncodingObject::raise_compatibility_error(Env *env, const EncodingObject *other_encoding) const {
    // Resolve the exception class first so the lookup happens before the message is built.
    auto compatibility_error = fetch_nested_const({ "Encoding"_s, "CompatibilityError"_s })->as_class();
    env->raise(
        compatibility_error,
        "incompatible character encodings: {} and {}",
        name()->string(),
        other_encoding->name()->string());
}

Value EncodingObject::inspect(Env *env) const {
if (is_dummy())
return StringObject::format("#<Encoding:{} (dummy)>", name());
Expand Down
1 change: 1 addition & 0 deletions src/regexp_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,7 @@ int RegexpObject::search(Env *env, const StringObject *string_obj, int start, On
const unsigned char *char_start = unsigned_str + start;
const unsigned char *char_range = char_end;

// FIXME: check if it's already FIXEDENCODING
if (string_obj->encoding() != encoding()) {
RegexpObject temp_regexp;
temp_regexp.initialize_internal(env, m_pattern, m_options | RegexOpts::FixedEncoding);
Expand Down
Loading

0 comments on commit 63cec98

Please sign in to comment.