Skip to content

Commit

Permalink
Merge pull request #2091 from natalie-lang/string-dump
Browse files Browse the repository at this point in the history
Make String#dump spec-compliant
  • Loading branch information
seven1m authored Jun 11, 2024
2 parents 4864137 + cf7c61e commit abf4de4
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 44 deletions.
60 changes: 28 additions & 32 deletions spec/core/string/dump_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -314,34 +314,32 @@
end

it "returns a string with non-printing single-byte UTF-8 characters replaced by \\x notation" do
  # Each row pairs a one-character UTF-8 string (built from an octal codepoint
  # literal) with the exact text String#dump must return for it: the character
  # rendered as an upper-case \xNN escape inside double quotes.
  # The table skips 0007-0015 and 0033 -- presumably because those characters
  # have dedicated escapes that are covered by other examples (verify against
  # the rest of this spec file).
  # The assertions run directly; they are no longer wrapped in a NATFIXME block
  # that expected a SpecFailedException.
  [ [0000.chr('utf-8'), '"\x00"'],
    [0001.chr('utf-8'), '"\x01"'],
    [0002.chr('utf-8'), '"\x02"'],
    [0003.chr('utf-8'), '"\x03"'],
    [0004.chr('utf-8'), '"\x04"'],
    [0005.chr('utf-8'), '"\x05"'],
    [0006.chr('utf-8'), '"\x06"'],
    [0016.chr('utf-8'), '"\x0E"'],
    [0017.chr('utf-8'), '"\x0F"'],
    [0020.chr('utf-8'), '"\x10"'],
    [0021.chr('utf-8'), '"\x11"'],
    [0022.chr('utf-8'), '"\x12"'],
    [0023.chr('utf-8'), '"\x13"'],
    [0024.chr('utf-8'), '"\x14"'],
    [0025.chr('utf-8'), '"\x15"'],
    [0026.chr('utf-8'), '"\x16"'],
    [0027.chr('utf-8'), '"\x17"'],
    [0030.chr('utf-8'), '"\x18"'],
    [0031.chr('utf-8'), '"\x19"'],
    [0032.chr('utf-8'), '"\x1A"'],
    [0034.chr('utf-8'), '"\x1C"'],
    [0035.chr('utf-8'), '"\x1D"'],
    [0036.chr('utf-8'), '"\x1E"'],
    [0037.chr('utf-8'), '"\x1F"'],
    [0177.chr('utf-8'), '"\x7F"']
  ].should be_computed_by(:dump)
end

it "returns a string with multi-byte UTF-8 characters less than or equal 0xFFFF replaced by \\uXXXX notation with upper-case hex digits" do
Expand Down Expand Up @@ -381,10 +379,8 @@
end

it "returns a string with multi-byte UTF-8 characters greater than 0xFFFF replaced by \\u{XXXXXX} notation with upper-case hex digits" do
  # Codepoints above 0xFFFF must be dumped in the braced \u{...} form with
  # upper-case hex digits. The two cases are 0x10000 (the first codepoint
  # above 0xFFFF) and 0x10FFFF.
  # The assertions run directly; they are no longer wrapped in a NATFIXME block
  # that expected a SpecFailedException.
  0x10000.chr('utf-8').dump.should == '"\u{10000}"'
  0x10FFFF.chr('utf-8').dump.should == '"\u{10FFFF}"'
end

it "includes .force_encoding(name) if the encoding isn't ASCII compatible" do
Expand Down
7 changes: 5 additions & 2 deletions src/encoding/utf8_encoding_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,11 @@ bool Utf8EncodingObject::is_printable_char(const nat_int_t c) const {
}

// Builds the backslash-u escape sequence for codepoint `c`:
//   - c <= 0xFFFF : "\uXXXX"   (upper-case hex, zero-padded to at least four digits)
//   - c >  0xFFFF : "\u{X...}" (upper-case hex inside braces, no padding)
// Returns the escape text as a String.
String Utf8EncodingObject::escaped_char(const nat_int_t c) const {
    // Largest possible output is "\u{" + 16 hex digits + "}" + NUL = 21 bytes,
    // assuming nat_int_t is no wider than 64 bits (TODO confirm). snprintf
    // truncates rather than overflows if that assumption is ever wrong.
    char buf[21];

    // %llX formats an unsigned long long, so convert explicitly instead of
    // passing the signed value straight through the varargs.
    const auto codepoint = static_cast<unsigned long long>(c);

    if (c > 0xFFFF)
        snprintf(buf, sizeof(buf), "\\u{%llX}", codepoint);
    else
        snprintf(buf, sizeof(buf), "\\u%04llX", codepoint);
    return String(buf);
}

Expand Down
27 changes: 17 additions & 10 deletions src/string_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,24 +503,25 @@ Value StringObject::tr_in_place(Env *env, Value from_value, Value to_value) {
return this;
}

StringObject *StringObject::inspect(Env *env) const {
static StringObject *inspect_internal(const StringObject *str, Env *env, bool for_dump = false) {
StringObject *out = new StringObject { "\"" };
auto encoding = str->encoding();

size_t index = 0;
auto [valid, ch] = next_char_result(&index);
auto [valid, ch] = str->next_char_result(&index);
while (!ch.is_empty()) {
if (!valid) {
for (size_t i = 0; i < ch.size(); i++)
out->append_sprintf("\\x%02X", static_cast<uint8_t>(ch[i]));
auto pair = next_char_result(&index);
auto pair = str->next_char_result(&index);
valid = pair.first;
ch = pair.second;
continue;
}
const auto c = m_encoding->decode_codepoint(ch);
auto pair = next_char_result(&index);
const auto c = encoding->decode_codepoint(ch);
auto pair = str->next_char_result(&index);
valid = pair.first;
const auto c2 = !valid || ch.is_empty() ? 0 : m_encoding->decode_codepoint(pair.second);
const auto c2 = !valid || ch.is_empty() ? 0 : encoding->decode_codepoint(pair.second);

if (c == '"' || c == '\\' || (c == '#' && (c2 == '{' || c2 == '$' || c2 == '@'))) {
out->append_char('\\');
Expand All @@ -541,11 +542,13 @@ StringObject *StringObject::inspect(Env *env) const {
out->append("\\t");
} else if (c == '\v') {
out->append("\\v");
} else if (m_encoding->is_printable_char(c)) {
} else if (encoding->is_printable_char(c) && (!for_dump || c <= 0xFFFF)) {
out->append(ch);
} else {
auto escaped_char = m_encoding->escaped_char(c);
out->append(escaped_char);
if (for_dump && c < 128)
out->append_sprintf("\\x%02X", c);
else
out->append(encoding->escaped_char(c));
}
ch = pair.second;
}
Expand All @@ -554,6 +557,10 @@ StringObject *StringObject::inspect(Env *env) const {
return out;
}

// Produces the quoted, escaped representation of this string by delegating to
// the file-local inspect_internal helper in its non-dump mode.
StringObject *StringObject::inspect(Env *env) const {
    StringObject *rendered = inspect_internal(this, env, /* for_dump */ false);
    return rendered;
}

// Debug helper: returns the raw contents of m_string wrapped in double quotes.
// No escaping is applied here; the bytes are passed to String::format as-is.
String StringObject::dbg_inspect() const {
    auto quoted = String::format("\"{}\"", m_string);
    return quoted;
}
Expand Down Expand Up @@ -2922,7 +2929,7 @@ Value StringObject::downcase_in_place(Env *env, Value arg1, Value arg2) {

Value StringObject::dump(Env *env) {
auto result = new StringObject { m_encoding };
result->append(inspect(env));
result->append(inspect_internal(this, env, true));
if (!m_encoding->is_ascii_compatible()) {
result->append_sprintf(".force_encoding(\"%s\")", m_encoding->name()->c_str());
}
Expand Down

0 comments on commit abf4de4

Please sign in to comment.