From d58153b8988bb910129a3a42abdec4981bd11e42 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 15 Jun 2024 22:39:50 -0500 Subject: [PATCH 1/2] Clarify that we use byte start for StringObject::index_int This was confusing me because String#index should use character start (which we'll need to fix as well.) --- include/natalie/string_object.hpp | 2 +- src/string_object.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/natalie/string_object.hpp b/include/natalie/string_object.hpp index 70d39cc0b..405c69fca 100644 --- a/include/natalie/string_object.hpp +++ b/include/natalie/string_object.hpp @@ -273,7 +273,7 @@ class StringObject : public Object { Value index(Env *, Value, Value) const; Value index(Env *, Value, size_t start) const; - nat_int_t index_int(Env *, Value, size_t start) const; + nat_int_t index_int(Env *, Value, size_t byte_start) const; Value rindex(Env *, Value) const; diff --git a/src/string_object.cpp b/src/string_object.cpp index e90336054..0b1c264d4 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -666,7 +666,7 @@ Value StringObject::index(Env *env, Value needle, size_t start) const { return Value::integer(0); } -nat_int_t StringObject::index_int(Env *env, Value needle, size_t start) const { +nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) const { auto needle_str = needle->to_str(env)->as_string(); if (needle_str->bytesize() == 0) @@ -675,13 +675,13 @@ nat_int_t StringObject::index_int(Env *env, Value needle, size_t start) const { if (bytesize() == 0) return -1; - if (start >= bytesize()) + if (byte_start >= bytesize()) return -1; - if (needle_str->bytesize() > bytesize() - start) + if (needle_str->bytesize() > bytesize() - byte_start) return -1; - auto ptr = memmem(c_str() + start, bytesize() - start, needle_str->c_str(), needle_str->bytesize()); + auto ptr = memmem(c_str() + byte_start, bytesize() - byte_start, needle_str->c_str(), needle_str->bytesize()); if (ptr == nullptr) return -1; From c09263edbca4065c5fb1b6b7fd812f6a868199de Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 15 Jun 2024 20:50:32 -0500 Subject: [PATCH 2/2] Make StringObject::each_line spec-compliant --- include/natalie/string_object.hpp | 2 + spec/core/string/shared/each_line.rb | 64 +++++++------- src/string_object.cpp | 125 ++++++++++++++++++--------- 3 files changed, 114 insertions(+), 77 deletions(-) diff --git a/include/natalie/string_object.hpp b/include/natalie/string_object.hpp index 405c69fca..95c7955cf 100644 --- a/include/natalie/string_object.hpp +++ b/include/natalie/string_object.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -214,6 +215,7 @@ class StringObject : public Object { Value each_grapheme_cluster(Env *, Block *); + void each_line(Env *, Value, Value, std::function) const; Value each_line(Env *, Value = nullptr, Value = nullptr, Block * = nullptr); Value lines(Env *, Value = nullptr, Value = nullptr, Block * = nullptr); diff --git a/spec/core/string/shared/each_line.rb b/spec/core/string/shared/each_line.rb index 50c134c2d..231a6d9d4 100644 --- a/spec/core/string/shared/each_line.rb +++ b/spec/core/string/shared/each_line.rb @@ -42,24 +42,18 @@ it "passes self as a whole to the block if the separator is nil" do a = [] - NATFIXME 'Support nil as separator', exception: TypeError, message: 'no implicit conversion from nil to string' do - "one\ntwo\r\nthree".send(@method, nil) { |s| a << s } - a.should == ["one\ntwo\r\nthree"] - end + "one\ntwo\r\nthree".send(@method, nil) { |s| a << s } + a.should == ["one\ntwo\r\nthree"] end it "yields paragraphs (broken by 2 or more successive newlines) when passed '' and replaces multiple newlines with only two ones" do a = [] "hello\nworld\n\n\nand\nuniverse\n\n\n\n\n".send(@method, '') { |s| a << s } - NATFIXME 'Support emtpy string as separator', exception: SpecFailedException do - a.should == ["hello\nworld\n\n", "and\nuniverse\n\n"] - end + a.should == ["hello\nworld\n\n", "and\nuniverse\n\n"] - a = [] - "hello\nworld\n\n\nand\nuniverse\n\n\n\n\ndog".send(@method, '') { |s| a << s } - NATFIXME 'Support emtpy string as separator', exception: SpecFailedException do - a.should == ["hello\nworld\n\n", "and\nuniverse\n\n", "dog"] - end + a = [] + "hello\nworld\n\n\nand\nuniverse\n\n\n\n\ndog".send(@method, '') { |s| a << s } + a.should == ["hello\nworld\n\n", "and\nuniverse\n\n", "dog"] end describe "uses $/" do @@ -72,22 +66,20 @@ end it "as the separator when none is given" do - NATFIXME 'Support nil as separator', exception: TypeError, message: 'no implicit conversion from nil to string' do - [ - "", "x", "x\ny", "x\ry", "x\r\ny", "x\n\r\r\ny", - "hello hullo bello" - ].each do |str| - ["", "llo", "\n", "\r", nil].each do |sep| - expected = [] - str.send(@method, sep) { |x| expected << x } - - suppress_warning {$/ = sep} - - actual = [] - suppress_warning {str.send(@method) { |x| actual << x }} - - actual.should == expected - end + [ + "", "x", "x\ny", "x\ry", "x\r\ny", "x\n\r\r\ny", + "hello hullo bello" + ].each do |str| + ["", "llo", "\n", "\r", nil].each do |sep| + expected = [] + str.send(@method, sep) { |x| expected << x } + + suppress_warning {$/ = sep} + + actual = [] + suppress_warning {str.send(@method) { |x| actual << x }} + + actual.should == expected end end end @@ -106,22 +98,26 @@ it "tries to convert the separator to a string using to_str" do separator = mock('l') - # separator.should_receive(:to_str).and_return("l") + separator.should_receive(:to_str).and_return("l") a = [] - NATFIXME 'tries to convert the separator to a string using to_str', exception: TypeError, message: 'no implicit conversion' do - "hello\nworld".send(@method, separator) { |s| a << s } - a.should == [ "hel", "l", "o\nworl", "d" ] - end + "hello\nworld".send(@method, separator) { |s| a << s } + a.should == [ "hel", "l", "o\nworl", "d" ] end it "does not care if the string is modified while substituting" do - str = "hello\nworld." + str = +"hello\nworld." out = [] str.send(@method){|x| out << x; str[-1] = '!' }.should == "hello\nworld!" out.should == ["hello\n", "world."] end + it "returns Strings in the same encoding as self" do + "one\ntwo\r\nthree".encode("US-ASCII").send(@method) do |s| + s.encoding.should == Encoding::US_ASCII + end + end + it "raises a TypeError when the separator can't be converted to a string" do -> { "hello world".send(@method, false) {} }.should raise_error(TypeError) -> { "hello world".send(@method, mock('x')) {} }.should raise_error(TypeError) diff --git a/src/string_object.cpp b/src/string_object.cpp index 0b1c264d4..b4f80e65a 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -2557,66 +2557,105 @@ Value StringObject::insert(Env *env, Value index_obj, Value other_str) { return this; } -// This function is used for both String#each_line and String#lines -static Value lines_inner(Value self, EncodingObject *encoding, bool is_each_line, Env *env, Value separator, Value chomp_value, Block *block) { +void StringObject::each_line(Env *env, Value separator, Value chomp_value, std::function callback) const { if (separator) { - separator->assert_type(env, Object::Type::String, "String"); + if (!separator->is_nil()) + separator = separator->to_str(env); } else { - separator = new StringObject { "\n" }; + auto dollar_slash = env->global_get("$/"_s); + if (dollar_slash->is_nil()) + separator = NilObject::the(); + else + separator = dollar_slash->to_str(env); + } + + auto self_dup = duplicate(env)->as_string(); + + if (is_empty()) { + if (separator->is_nil()) + callback(self_dup); + return; + } + + if (separator->is_nil()) + separator = new StringObject { "" }; + + bool paragraph_mode = false; + if (separator->as_string()->is_empty()) { + paragraph_mode = true; + separator = new StringObject { "\n\n" }; } const auto chomp = chomp_value ? chomp_value->is_truthy() : false; + auto separator_length = separator->as_string()->length(); + size_t last_index = 0; - auto self_dup = self->duplicate(env)->as_string(); - auto index = self_dup->index_int(env, separator->as_string(), 0); + for (;;) { + auto index = self_dup->index_int(env, separator, last_index); + auto ptr = &self_dup->c_str()[last_index]; - auto run_split = [&](auto callback) { - if (separator->as_string()->is_empty() || index == -1) { - callback(self_dup); - } else { - do { - const auto u_index = static_cast(index); - auto out = new StringObject { &self_dup->c_str()[last_index], u_index - last_index + separator->as_string()->length(), encoding }; - if (chomp) out->chomp_in_place(env, separator); - callback(out); - last_index = u_index + separator->as_string()->length(); - index = self_dup->index_int(env, separator->as_string(), last_index); - } while (index != -1); - auto out = new StringObject { &self_dup->c_str()[last_index], self_dup->length() - last_index, encoding }; - if (chomp) out->chomp_in_place(env, separator); - if (!out->is_empty()) callback(out); + if (index == -1) { + auto length = bytesize() - last_index; + if (length == 0) + break; + auto out = new StringObject { ptr, bytesize() - last_index, encoding() }; + callback(out); + break; } - }; - if (block) { - run_split([&](Value out) { - Value args[] = { out }; - return NAT_RUN_BLOCK_AND_POSSIBLY_BREAK(env, block, Args(1, args), nullptr); - }); - return self; - } else if (is_each_line) { + auto u_index = static_cast(index); + auto length = u_index - last_index + separator_length; + + if (paragraph_mode) { + for (size_t i = u_index + separator_length; i < bytesize(); i++) { + if (self_dup->c_str()[i] == '\n') + u_index++; + else + break; + } + } + + auto out = new StringObject { ptr, length, encoding() }; + + if (chomp) + out->chomp_in_place(env, separator); + + callback(out); + + last_index = u_index + separator_length; + } +} + +Value StringObject::each_line(Env *env, Value separator, Value chomp, Block *block) { + if (!block) { Vector args { separator }; - if (chomp) { + auto do_chomp = chomp ? chomp->is_truthy() : false; + if (do_chomp) { auto hash = new HashObject {}; - hash->put(env, "chomp"_s, chomp_value); + hash->put(env, "chomp"_s, chomp); args.push(hash); } - return self->enum_for(env, "each_line", Args(args, chomp)); - } else { - ArrayObject *ary = new ArrayObject {}; - run_split([&](Value out) { - ary->push(out); - }); - return ary; + return enum_for(env, "each_line", Args(args, do_chomp)); } -} -Value StringObject::each_line(Env *env, Value separator, Value chomp_value, Block *block) { - return lines_inner(this, m_encoding, true, env, separator, chomp_value, block); + each_line(env, separator, chomp, [&](StringObject *part) -> Value { + Value args[] = { part }; + NAT_RUN_BLOCK_AND_POSSIBLY_BREAK(env, block, Args(1, args), nullptr); + return this; + }); + return this; } -Value StringObject::lines(Env *env, Value separator, Value chomp_value, Block *block) { - return lines_inner(this, m_encoding, false, env, separator, chomp_value, block); +Value StringObject::lines(Env *env, Value separator, Value chomp, Block *block) { + if (block) + return each_line(env, separator, chomp, block); + + ArrayObject *ary = new ArrayObject {}; + each_line(env, separator, chomp, [&](StringObject *part) -> Value { + ary->push(part); + return this; + }); + return ary; } Value StringObject::ljust(Env *env, Value length_obj, Value pad_obj) const {