Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make String#each_line spec-compliant #2109

Merged
merged 2 commits into from
Jun 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/natalie/string_object.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include <assert.h>
#include <functional>
#include <stdarg.h>
#include <stdint.h>
#include <string.h>
Expand Down Expand Up @@ -214,6 +215,7 @@ class StringObject : public Object {

Value each_grapheme_cluster(Env *, Block *);

void each_line(Env *, Value, Value, std::function<Value(StringObject *)>) const;
Value each_line(Env *, Value = nullptr, Value = nullptr, Block * = nullptr);
Value lines(Env *, Value = nullptr, Value = nullptr, Block * = nullptr);

Expand Down Expand Up @@ -273,7 +275,7 @@ class StringObject : public Object {

Value index(Env *, Value, Value) const;
Value index(Env *, Value, size_t start) const;
nat_int_t index_int(Env *, Value, size_t start) const;
nat_int_t index_int(Env *, Value, size_t byte_start) const;

Value rindex(Env *, Value) const;

Expand Down
64 changes: 30 additions & 34 deletions spec/core/string/shared/each_line.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,18 @@

it "passes self as a whole to the block if the separator is nil" do
a = []
NATFIXME 'Support nil as separator', exception: TypeError, message: 'no implicit conversion from nil to string' do
"one\ntwo\r\nthree".send(@method, nil) { |s| a << s }
a.should == ["one\ntwo\r\nthree"]
end
"one\ntwo\r\nthree".send(@method, nil) { |s| a << s }
a.should == ["one\ntwo\r\nthree"]
end

it "yields paragraphs (broken by 2 or more successive newlines) when passed '' and replaces multiple newlines with only two ones" do
a = []
"hello\nworld\n\n\nand\nuniverse\n\n\n\n\n".send(@method, '') { |s| a << s }
NATFIXME 'Support emtpy string as separator', exception: SpecFailedException do
a.should == ["hello\nworld\n\n", "and\nuniverse\n\n"]
end
a.should == ["hello\nworld\n\n", "and\nuniverse\n\n"]

a = []
"hello\nworld\n\n\nand\nuniverse\n\n\n\n\ndog".send(@method, '') { |s| a << s }
NATFIXME 'Support emtpy string as separator', exception: SpecFailedException do
a.should == ["hello\nworld\n\n", "and\nuniverse\n\n", "dog"]
end
a = []
"hello\nworld\n\n\nand\nuniverse\n\n\n\n\ndog".send(@method, '') { |s| a << s }
a.should == ["hello\nworld\n\n", "and\nuniverse\n\n", "dog"]
end

describe "uses $/" do
Expand All @@ -72,22 +66,20 @@
end

it "as the separator when none is given" do
NATFIXME 'Support nil as separator', exception: TypeError, message: 'no implicit conversion from nil to string' do
[
"", "x", "x\ny", "x\ry", "x\r\ny", "x\n\r\r\ny",
"hello hullo bello"
].each do |str|
["", "llo", "\n", "\r", nil].each do |sep|
expected = []
str.send(@method, sep) { |x| expected << x }

suppress_warning {$/ = sep}

actual = []
suppress_warning {str.send(@method) { |x| actual << x }}

actual.should == expected
end
[
"", "x", "x\ny", "x\ry", "x\r\ny", "x\n\r\r\ny",
"hello hullo bello"
].each do |str|
["", "llo", "\n", "\r", nil].each do |sep|
expected = []
str.send(@method, sep) { |x| expected << x }

suppress_warning {$/ = sep}

actual = []
suppress_warning {str.send(@method) { |x| actual << x }}

actual.should == expected
end
end
end
Expand All @@ -106,22 +98,26 @@

it "tries to convert the separator to a string using to_str" do
separator = mock('l')
# separator.should_receive(:to_str).and_return("l")
separator.should_receive(:to_str).and_return("l")

a = []
NATFIXME 'tries to convert the separator to a string using to_str', exception: TypeError, message: 'no implicit conversion' do
"hello\nworld".send(@method, separator) { |s| a << s }
a.should == [ "hel", "l", "o\nworl", "d" ]
end
"hello\nworld".send(@method, separator) { |s| a << s }
a.should == [ "hel", "l", "o\nworl", "d" ]
end

it "does not care if the string is modified while substituting" do
str = "hello\nworld."
str = +"hello\nworld."
out = []
str.send(@method){|x| out << x; str[-1] = '!' }.should == "hello\nworld!"
out.should == ["hello\n", "world."]
end

it "returns Strings in the same encoding as self" do
"one\ntwo\r\nthree".encode("US-ASCII").send(@method) do |s|
s.encoding.should == Encoding::US_ASCII
end
end

it "raises a TypeError when the separator can't be converted to a string" do
-> { "hello world".send(@method, false) {} }.should raise_error(TypeError)
-> { "hello world".send(@method, mock('x')) {} }.should raise_error(TypeError)
Expand Down
133 changes: 86 additions & 47 deletions src/string_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ Value StringObject::index(Env *env, Value needle, size_t start) const {
return Value::integer(0);
}

nat_int_t StringObject::index_int(Env *env, Value needle, size_t start) const {
nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) const {
auto needle_str = needle->to_str(env)->as_string();

if (needle_str->bytesize() == 0)
Expand All @@ -675,13 +675,13 @@ nat_int_t StringObject::index_int(Env *env, Value needle, size_t start) const {
if (bytesize() == 0)
return -1;

if (start >= bytesize())
if (byte_start >= bytesize())
return -1;

if (needle_str->bytesize() > bytesize() - start)
if (needle_str->bytesize() > bytesize() - byte_start)
return -1;

auto ptr = memmem(c_str() + start, bytesize() - start, needle_str->c_str(), needle_str->bytesize());
auto ptr = memmem(c_str() + byte_start, bytesize() - byte_start, needle_str->c_str(), needle_str->bytesize());
if (ptr == nullptr)
return -1;

Expand Down Expand Up @@ -2557,66 +2557,105 @@ Value StringObject::insert(Env *env, Value index_obj, Value other_str) {
return this;
}

// This function is used for both String#each_line and String#lines
static Value lines_inner(Value self, EncodingObject *encoding, bool is_each_line, Env *env, Value separator, Value chomp_value, Block *block) {
void StringObject::each_line(Env *env, Value separator, Value chomp_value, std::function<Value(StringObject *)> callback) const {
if (separator) {
separator->assert_type(env, Object::Type::String, "String");
if (!separator->is_nil())
separator = separator->to_str(env);
} else {
separator = new StringObject { "\n" };
auto dollar_slash = env->global_get("$/"_s);
if (dollar_slash->is_nil())
separator = NilObject::the();
else
separator = dollar_slash->to_str(env);
}

auto self_dup = duplicate(env)->as_string();

if (is_empty()) {
if (separator->is_nil())
callback(self_dup);
return;
}

if (separator->is_nil())
separator = new StringObject { "" };

bool paragraph_mode = false;
if (separator->as_string()->is_empty()) {
paragraph_mode = true;
separator = new StringObject { "\n\n" };
}

const auto chomp = chomp_value ? chomp_value->is_truthy() : false;
auto separator_length = separator->as_string()->length();

size_t last_index = 0;
auto self_dup = self->duplicate(env)->as_string();
auto index = self_dup->index_int(env, separator->as_string(), 0);
for (;;) {
auto index = self_dup->index_int(env, separator, last_index);
auto ptr = &self_dup->c_str()[last_index];

auto run_split = [&](auto callback) {
if (separator->as_string()->is_empty() || index == -1) {
callback(self_dup);
} else {
do {
const auto u_index = static_cast<size_t>(index);
auto out = new StringObject { &self_dup->c_str()[last_index], u_index - last_index + separator->as_string()->length(), encoding };
if (chomp) out->chomp_in_place(env, separator);
callback(out);
last_index = u_index + separator->as_string()->length();
index = self_dup->index_int(env, separator->as_string(), last_index);
} while (index != -1);
auto out = new StringObject { &self_dup->c_str()[last_index], self_dup->length() - last_index, encoding };
if (chomp) out->chomp_in_place(env, separator);
if (!out->is_empty()) callback(out);
if (index == -1) {
auto length = bytesize() - last_index;
if (length == 0)
break;
auto out = new StringObject { ptr, bytesize() - last_index, encoding() };
callback(out);
break;
}
};

if (block) {
run_split([&](Value out) {
Value args[] = { out };
return NAT_RUN_BLOCK_AND_POSSIBLY_BREAK(env, block, Args(1, args), nullptr);
});
return self;
} else if (is_each_line) {
auto u_index = static_cast<size_t>(index);
auto length = u_index - last_index + separator_length;

if (paragraph_mode) {
for (size_t i = u_index + separator_length; i < bytesize(); i++) {
if (self_dup->c_str()[i] == '\n')
u_index++;
else
break;
}
}

auto out = new StringObject { ptr, length, encoding() };

if (chomp)
out->chomp_in_place(env, separator);

callback(out);

last_index = u_index + separator_length;
}
}

Value StringObject::each_line(Env *env, Value separator, Value chomp, Block *block) {
if (!block) {
Vector<Value> args { separator };
if (chomp) {
auto do_chomp = chomp ? chomp->is_truthy() : false;
if (do_chomp) {
auto hash = new HashObject {};
hash->put(env, "chomp"_s, chomp_value);
hash->put(env, "chomp"_s, chomp);
args.push(hash);
}
return self->enum_for(env, "each_line", Args(args, chomp));
} else {
ArrayObject *ary = new ArrayObject {};
run_split([&](Value out) {
ary->push(out);
});
return ary;
return enum_for(env, "each_line", Args(args, do_chomp));
}
}

Value StringObject::each_line(Env *env, Value separator, Value chomp_value, Block *block) {
return lines_inner(this, m_encoding, true, env, separator, chomp_value, block);
each_line(env, separator, chomp, [&](StringObject *part) -> Value {
Value args[] = { part };
NAT_RUN_BLOCK_AND_POSSIBLY_BREAK(env, block, Args(1, args), nullptr);
return this;
});
return this;
}

Value StringObject::lines(Env *env, Value separator, Value chomp_value, Block *block) {
return lines_inner(this, m_encoding, false, env, separator, chomp_value, block);
Value StringObject::lines(Env *env, Value separator, Value chomp, Block *block) {
if (block)
return each_line(env, separator, chomp, block);

ArrayObject *ary = new ArrayObject {};
each_line(env, separator, chomp, [&](StringObject *part) -> Value {
ary->push(part);
return this;
});
return ary;
}

Value StringObject::ljust(Env *env, Value length_obj, Value pad_obj) const {
Expand Down
Loading