Skip to content

Commit

Permalink
Merge pull request #2109 from natalie-lang/string-each_line
Browse files Browse the repository at this point in the history
  • Loading branch information
seven1m authored Jun 16, 2024
2 parents ce4e6e9 + c09263e commit e72dc28
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 82 deletions.
4 changes: 3 additions & 1 deletion include/natalie/string_object.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include <assert.h>
#include <functional>
#include <stdarg.h>
#include <stdint.h>
#include <string.h>
Expand Down Expand Up @@ -214,6 +215,7 @@ class StringObject : public Object {

Value each_grapheme_cluster(Env *, Block *);

void each_line(Env *, Value, Value, std::function<Value(StringObject *)>) const;
Value each_line(Env *, Value = nullptr, Value = nullptr, Block * = nullptr);
Value lines(Env *, Value = nullptr, Value = nullptr, Block * = nullptr);

Expand Down Expand Up @@ -273,7 +275,7 @@ class StringObject : public Object {

Value index(Env *, Value, Value) const;
Value index(Env *, Value, size_t start) const;
nat_int_t index_int(Env *, Value, size_t start) const;
nat_int_t index_int(Env *, Value, size_t byte_start) const;

Value rindex(Env *, Value) const;

Expand Down
64 changes: 30 additions & 34 deletions spec/core/string/shared/each_line.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,18 @@

it "passes self as a whole to the block if the separator is nil" do
a = []
NATFIXME 'Support nil as separator', exception: TypeError, message: 'no implicit conversion from nil to string' do
"one\ntwo\r\nthree".send(@method, nil) { |s| a << s }
a.should == ["one\ntwo\r\nthree"]
end
"one\ntwo\r\nthree".send(@method, nil) { |s| a << s }
a.should == ["one\ntwo\r\nthree"]
end

it "yields paragraphs (broken by 2 or more successive newlines) when passed '' and replaces multiple newlines with only two ones" do
a = []
"hello\nworld\n\n\nand\nuniverse\n\n\n\n\n".send(@method, '') { |s| a << s }
NATFIXME 'Support emtpy string as separator', exception: SpecFailedException do
a.should == ["hello\nworld\n\n", "and\nuniverse\n\n"]
end
a.should == ["hello\nworld\n\n", "and\nuniverse\n\n"]

a = []
"hello\nworld\n\n\nand\nuniverse\n\n\n\n\ndog".send(@method, '') { |s| a << s }
NATFIXME 'Support emtpy string as separator', exception: SpecFailedException do
a.should == ["hello\nworld\n\n", "and\nuniverse\n\n", "dog"]
end
a = []
"hello\nworld\n\n\nand\nuniverse\n\n\n\n\ndog".send(@method, '') { |s| a << s }
a.should == ["hello\nworld\n\n", "and\nuniverse\n\n", "dog"]
end

describe "uses $/" do
Expand All @@ -72,22 +66,20 @@
end

it "as the separator when none is given" do
NATFIXME 'Support nil as separator', exception: TypeError, message: 'no implicit conversion from nil to string' do
[
"", "x", "x\ny", "x\ry", "x\r\ny", "x\n\r\r\ny",
"hello hullo bello"
].each do |str|
["", "llo", "\n", "\r", nil].each do |sep|
expected = []
str.send(@method, sep) { |x| expected << x }

suppress_warning {$/ = sep}

actual = []
suppress_warning {str.send(@method) { |x| actual << x }}

actual.should == expected
end
[
"", "x", "x\ny", "x\ry", "x\r\ny", "x\n\r\r\ny",
"hello hullo bello"
].each do |str|
["", "llo", "\n", "\r", nil].each do |sep|
expected = []
str.send(@method, sep) { |x| expected << x }

suppress_warning {$/ = sep}

actual = []
suppress_warning {str.send(@method) { |x| actual << x }}

actual.should == expected
end
end
end
Expand All @@ -106,22 +98,26 @@

it "tries to convert the separator to a string using to_str" do
separator = mock('l')
# separator.should_receive(:to_str).and_return("l")
separator.should_receive(:to_str).and_return("l")

a = []
NATFIXME 'tries to convert the separator to a string using to_str', exception: TypeError, message: 'no implicit conversion' do
"hello\nworld".send(@method, separator) { |s| a << s }
a.should == [ "hel", "l", "o\nworl", "d" ]
end
"hello\nworld".send(@method, separator) { |s| a << s }
a.should == [ "hel", "l", "o\nworl", "d" ]
end

it "does not care if the string is modified while substituting" do
str = "hello\nworld."
str = +"hello\nworld."
out = []
str.send(@method){|x| out << x; str[-1] = '!' }.should == "hello\nworld!"
out.should == ["hello\n", "world."]
end

it "returns Strings in the same encoding as self" do
"one\ntwo\r\nthree".encode("US-ASCII").send(@method) do |s|
s.encoding.should == Encoding::US_ASCII
end
end

it "raises a TypeError when the separator can't be converted to a string" do
-> { "hello world".send(@method, false) {} }.should raise_error(TypeError)
-> { "hello world".send(@method, mock('x')) {} }.should raise_error(TypeError)
Expand Down
133 changes: 86 additions & 47 deletions src/string_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ Value StringObject::index(Env *env, Value needle, size_t start) const {
return Value::integer(0);
}

nat_int_t StringObject::index_int(Env *env, Value needle, size_t start) const {
nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) const {
auto needle_str = needle->to_str(env)->as_string();

if (needle_str->bytesize() == 0)
Expand All @@ -675,13 +675,13 @@ nat_int_t StringObject::index_int(Env *env, Value needle, size_t start) const {
if (bytesize() == 0)
return -1;

if (start >= bytesize())
if (byte_start >= bytesize())
return -1;

if (needle_str->bytesize() > bytesize() - start)
if (needle_str->bytesize() > bytesize() - byte_start)
return -1;

auto ptr = memmem(c_str() + start, bytesize() - start, needle_str->c_str(), needle_str->bytesize());
auto ptr = memmem(c_str() + byte_start, bytesize() - byte_start, needle_str->c_str(), needle_str->bytesize());
if (ptr == nullptr)
return -1;

Expand Down Expand Up @@ -2557,66 +2557,105 @@ Value StringObject::insert(Env *env, Value index_obj, Value other_str) {
return this;
}

// This function is used for both String#each_line and String#lines
static Value lines_inner(Value self, EncodingObject *encoding, bool is_each_line, Env *env, Value separator, Value chomp_value, Block *block) {
void StringObject::each_line(Env *env, Value separator, Value chomp_value, std::function<Value(StringObject *)> callback) const {
if (separator) {
separator->assert_type(env, Object::Type::String, "String");
if (!separator->is_nil())
separator = separator->to_str(env);
} else {
separator = new StringObject { "\n" };
auto dollar_slash = env->global_get("$/"_s);
if (dollar_slash->is_nil())
separator = NilObject::the();
else
separator = dollar_slash->to_str(env);
}

auto self_dup = duplicate(env)->as_string();

if (is_empty()) {
if (separator->is_nil())
callback(self_dup);
return;
}

if (separator->is_nil())
separator = new StringObject { "" };

bool paragraph_mode = false;
if (separator->as_string()->is_empty()) {
paragraph_mode = true;
separator = new StringObject { "\n\n" };
}

const auto chomp = chomp_value ? chomp_value->is_truthy() : false;
auto separator_length = separator->as_string()->length();

size_t last_index = 0;
auto self_dup = self->duplicate(env)->as_string();
auto index = self_dup->index_int(env, separator->as_string(), 0);
for (;;) {
auto index = self_dup->index_int(env, separator, last_index);
auto ptr = &self_dup->c_str()[last_index];

auto run_split = [&](auto callback) {
if (separator->as_string()->is_empty() || index == -1) {
callback(self_dup);
} else {
do {
const auto u_index = static_cast<size_t>(index);
auto out = new StringObject { &self_dup->c_str()[last_index], u_index - last_index + separator->as_string()->length(), encoding };
if (chomp) out->chomp_in_place(env, separator);
callback(out);
last_index = u_index + separator->as_string()->length();
index = self_dup->index_int(env, separator->as_string(), last_index);
} while (index != -1);
auto out = new StringObject { &self_dup->c_str()[last_index], self_dup->length() - last_index, encoding };
if (chomp) out->chomp_in_place(env, separator);
if (!out->is_empty()) callback(out);
if (index == -1) {
auto length = bytesize() - last_index;
if (length == 0)
break;
auto out = new StringObject { ptr, bytesize() - last_index, encoding() };
callback(out);
break;
}
};

if (block) {
run_split([&](Value out) {
Value args[] = { out };
return NAT_RUN_BLOCK_AND_POSSIBLY_BREAK(env, block, Args(1, args), nullptr);
});
return self;
} else if (is_each_line) {
auto u_index = static_cast<size_t>(index);
auto length = u_index - last_index + separator_length;

if (paragraph_mode) {
for (size_t i = u_index + separator_length; i < bytesize(); i++) {
if (self_dup->c_str()[i] == '\n')
u_index++;
else
break;
}
}

auto out = new StringObject { ptr, length, encoding() };

if (chomp)
out->chomp_in_place(env, separator);

callback(out);

last_index = u_index + separator_length;
}
}

Value StringObject::each_line(Env *env, Value separator, Value chomp, Block *block) {
if (!block) {
Vector<Value> args { separator };
if (chomp) {
auto do_chomp = chomp ? chomp->is_truthy() : false;
if (do_chomp) {
auto hash = new HashObject {};
hash->put(env, "chomp"_s, chomp_value);
hash->put(env, "chomp"_s, chomp);
args.push(hash);
}
return self->enum_for(env, "each_line", Args(args, chomp));
} else {
ArrayObject *ary = new ArrayObject {};
run_split([&](Value out) {
ary->push(out);
});
return ary;
return enum_for(env, "each_line", Args(args, do_chomp));
}
}

Value StringObject::each_line(Env *env, Value separator, Value chomp_value, Block *block) {
return lines_inner(this, m_encoding, true, env, separator, chomp_value, block);
each_line(env, separator, chomp, [&](StringObject *part) -> Value {
Value args[] = { part };
NAT_RUN_BLOCK_AND_POSSIBLY_BREAK(env, block, Args(1, args), nullptr);
return this;
});
return this;
}

Value StringObject::lines(Env *env, Value separator, Value chomp_value, Block *block) {
return lines_inner(this, m_encoding, false, env, separator, chomp_value, block);
Value StringObject::lines(Env *env, Value separator, Value chomp, Block *block) {
if (block)
return each_line(env, separator, chomp, block);

ArrayObject *ary = new ArrayObject {};
each_line(env, separator, chomp, [&](StringObject *part) -> Value {
ary->push(part);
return this;
});
return ary;
}

Value StringObject::ljust(Env *env, Value length_obj, Value pad_obj) const {
Expand Down

0 comments on commit e72dc28

Please sign in to comment.