diff --git a/spec/core/string/dump_spec.rb b/spec/core/string/dump_spec.rb
index 39da2ec08..cab8beff5 100644
--- a/spec/core/string/dump_spec.rb
+++ b/spec/core/string/dump_spec.rb
@@ -314,34 +314,32 @@
   end
 
   it "returns a string with non-printing single-byte UTF-8 characters replaced by \\x notation" do
-    NATFIXME 'returns a string with non-printing single-byte UTF-8 characters replaced by \\x notation', exception: SpecFailedException do
-      [ [0000.chr('utf-8'), '"\x00"'],
-        [0001.chr('utf-8'), '"\x01"'],
-        [0002.chr('utf-8'), '"\x02"'],
-        [0003.chr('utf-8'), '"\x03"'],
-        [0004.chr('utf-8'), '"\x04"'],
-        [0005.chr('utf-8'), '"\x05"'],
-        [0006.chr('utf-8'), '"\x06"'],
-        [0016.chr('utf-8'), '"\x0E"'],
-        [0017.chr('utf-8'), '"\x0F"'],
-        [0020.chr('utf-8'), '"\x10"'],
-        [0021.chr('utf-8'), '"\x11"'],
-        [0022.chr('utf-8'), '"\x12"'],
-        [0023.chr('utf-8'), '"\x13"'],
-        [0024.chr('utf-8'), '"\x14"'],
-        [0025.chr('utf-8'), '"\x15"'],
-        [0026.chr('utf-8'), '"\x16"'],
-        [0027.chr('utf-8'), '"\x17"'],
-        [0030.chr('utf-8'), '"\x18"'],
-        [0031.chr('utf-8'), '"\x19"'],
-        [0032.chr('utf-8'), '"\x1A"'],
-        [0034.chr('utf-8'), '"\x1C"'],
-        [0035.chr('utf-8'), '"\x1D"'],
-        [0036.chr('utf-8'), '"\x1E"'],
-        [0037.chr('utf-8'), '"\x1F"'],
-        [0177.chr('utf-8'), '"\x7F"']
-      ].should be_computed_by(:dump)
-    end
+    [ [0000.chr('utf-8'), '"\x00"'],
+      [0001.chr('utf-8'), '"\x01"'],
+      [0002.chr('utf-8'), '"\x02"'],
+      [0003.chr('utf-8'), '"\x03"'],
+      [0004.chr('utf-8'), '"\x04"'],
+      [0005.chr('utf-8'), '"\x05"'],
+      [0006.chr('utf-8'), '"\x06"'],
+      [0016.chr('utf-8'), '"\x0E"'],
+      [0017.chr('utf-8'), '"\x0F"'],
+      [0020.chr('utf-8'), '"\x10"'],
+      [0021.chr('utf-8'), '"\x11"'],
+      [0022.chr('utf-8'), '"\x12"'],
+      [0023.chr('utf-8'), '"\x13"'],
+      [0024.chr('utf-8'), '"\x14"'],
+      [0025.chr('utf-8'), '"\x15"'],
+      [0026.chr('utf-8'), '"\x16"'],
+      [0027.chr('utf-8'), '"\x17"'],
+      [0030.chr('utf-8'), '"\x18"'],
+      [0031.chr('utf-8'), '"\x19"'],
+      [0032.chr('utf-8'), '"\x1A"'],
+      [0034.chr('utf-8'), '"\x1C"'],
+      [0035.chr('utf-8'), '"\x1D"'],
+      [0036.chr('utf-8'), '"\x1E"'],
+      [0037.chr('utf-8'), '"\x1F"'],
+      [0177.chr('utf-8'), '"\x7F"']
+    ].should be_computed_by(:dump)
   end
 
   it "returns a string with multi-byte UTF-8 characters less than or equal 0xFFFF replaced by \\uXXXX notation with upper-case hex digits" do
@@ -381,10 +379,8 @@
   end
 
   it "returns a string with multi-byte UTF-8 characters greater than 0xFFFF replaced by \\u{XXXXXX} notation with upper-case hex digits" do
-    NATFIXME 'returns a string with multi-byte UTF-8 characters greater than 0xFFFF replaced by \\u{XXXXXX} notation with upper-case hex digits', exception: SpecFailedException do
-      0x10000.chr('utf-8').dump.should == '"\u{10000}"'
-      0x10FFFF.chr('utf-8').dump.should == '"\u{10FFFF}"'
-    end
+    0x10000.chr('utf-8').dump.should == '"\u{10000}"'
+    0x10FFFF.chr('utf-8').dump.should == '"\u{10FFFF}"'
   end
 
   it "includes .force_encoding(name) if the encoding isn't ASCII compatible" do
diff --git a/src/encoding/utf8_encoding_object.cpp b/src/encoding/utf8_encoding_object.cpp
index 9800edca2..269c1246d 100644
--- a/src/encoding/utf8_encoding_object.cpp
+++ b/src/encoding/utf8_encoding_object.cpp
@@ -159,8 +159,11 @@ bool Utf8EncodingObject::is_printable_char(const nat_int_t c) const {
 }
 
 String Utf8EncodingObject::escaped_char(const nat_int_t c) const {
-    char buf[7];
-    snprintf(buf, 7, "\\u%04llX", c);
+    char buf[21]; // fits "\u{" + up to 16 hex digits of a 64-bit value + "}" + NUL
+    if (c > 0xFFFF)
+        snprintf(buf, sizeof(buf), "\\u{%llX}", c);
+    else
+        snprintf(buf, sizeof(buf), "\\u%04llX", c);
     return String(buf);
 }
 
diff --git a/src/string_object.cpp b/src/string_object.cpp
index e1d489b11..5cf51228e 100644
--- a/src/string_object.cpp
+++ b/src/string_object.cpp
@@ -503,24 +503,25 @@ Value StringObject::tr_in_place(Env *env, Value from_value, Value to_value) {
     return this;
 }
 
-StringObject *StringObject::inspect(Env *env) const {
+static StringObject *inspect_internal(const StringObject *str, Env *env, bool for_dump = false) {
     StringObject *out = new StringObject { "\"" };
+    auto encoding = str->encoding();
     size_t index = 0;
-    auto [valid, ch] = next_char_result(&index);
+    auto [valid, ch] = str->next_char_result(&index);
     while (!ch.is_empty()) {
         if (!valid) {
             for (size_t i = 0; i < ch.size(); i++)
                 out->append_sprintf("\\x%02X", static_cast(ch[i]));
-            auto pair = next_char_result(&index);
+            auto pair = str->next_char_result(&index);
             valid = pair.first;
             ch = pair.second;
             continue;
         }
 
-        const auto c = m_encoding->decode_codepoint(ch);
-        auto pair = next_char_result(&index);
+        const auto c = encoding->decode_codepoint(ch);
+        auto pair = str->next_char_result(&index);
         valid = pair.first;
-        const auto c2 = !valid || ch.is_empty() ? 0 : m_encoding->decode_codepoint(pair.second);
+        const auto c2 = !valid || ch.is_empty() ? 0 : encoding->decode_codepoint(pair.second);
 
         if (c == '"' || c == '\\' || (c == '#' && (c2 == '{' || c2 == '$' || c2 == '@'))) {
             out->append_char('\\');
@@ -541,11 +542,13 @@ StringObject *StringObject::inspect(Env *env) const {
             out->append("\\t");
         } else if (c == '\v') {
             out->append("\\v");
-        } else if (m_encoding->is_printable_char(c)) {
+        } else if (encoding->is_printable_char(c) && (!for_dump || c <= 0xFFFF)) { // dump never copies codepoints above 0xFFFF verbatim
             out->append(ch);
         } else {
-            auto escaped_char = m_encoding->escaped_char(c);
-            out->append(escaped_char);
+            if (for_dump && c < 128)
+                out->append_sprintf("\\x%02X", static_cast<unsigned int>(c)); // %X reads an unsigned int; c may be wider, and is < 128 here
+            else
+                out->append(encoding->escaped_char(c));
         }
         ch = pair.second;
     }
@@ -554,6 +557,10 @@ StringObject *StringObject::inspect(Env *env) const {
     return out;
 }
 
+StringObject *StringObject::inspect(Env *env) const {
+    return inspect_internal(this, env);
+}
+
 String StringObject::dbg_inspect() const {
     return String::format("\"{}\"", m_string);
 }
@@ -2922,7 +2929,7 @@ Value StringObject::downcase_in_place(Env *env, Value arg1, Value arg2) {
 
 Value StringObject::dump(Env *env) {
     auto result = new StringObject { m_encoding };
-    result->append(inspect(env));
+    result->append(inspect_internal(this, env, true));
     if (!m_encoding->is_ascii_compatible()) {
         result->append_sprintf(".force_encoding(\"%s\")", m_encoding->name()->c_str());
     }