Merge pull request #2091 from natalie-lang/string-dump

Make String#dump spec-compliant
natalie-lang · Jun 11, 2024 · abf4de4
2 parents 4864137 + cf7c61e
commit abf4de4
Show file tree

Hide file tree

Showing 3 changed files with 50 additions and 44 deletions.
diff --git a/spec/core/string/dump_spec.rb b/spec/core/string/dump_spec.rb
@@ -314,34 +314,32 @@
   end
 
   it "returns a string with non-printing single-byte UTF-8 characters replaced by \\x notation" do
-    NATFIXME 'returns a string with non-printing single-byte UTF-8 characters replaced by \\x notation', exception: SpecFailedException do
-      [ [0000.chr('utf-8'), '"\x00"'],
-        [0001.chr('utf-8'), '"\x01"'],
-        [0002.chr('utf-8'), '"\x02"'],
-        [0003.chr('utf-8'), '"\x03"'],
-        [0004.chr('utf-8'), '"\x04"'],
-        [0005.chr('utf-8'), '"\x05"'],
-        [0006.chr('utf-8'), '"\x06"'],
-        [0016.chr('utf-8'), '"\x0E"'],
-        [0017.chr('utf-8'), '"\x0F"'],
-        [0020.chr('utf-8'), '"\x10"'],
-        [0021.chr('utf-8'), '"\x11"'],
-        [0022.chr('utf-8'), '"\x12"'],
-        [0023.chr('utf-8'), '"\x13"'],
-        [0024.chr('utf-8'), '"\x14"'],
-        [0025.chr('utf-8'), '"\x15"'],
-        [0026.chr('utf-8'), '"\x16"'],
-        [0027.chr('utf-8'), '"\x17"'],
-        [0030.chr('utf-8'), '"\x18"'],
-        [0031.chr('utf-8'), '"\x19"'],
-        [0032.chr('utf-8'), '"\x1A"'],
-        [0034.chr('utf-8'), '"\x1C"'],
-        [0035.chr('utf-8'), '"\x1D"'],
-        [0036.chr('utf-8'), '"\x1E"'],
-        [0037.chr('utf-8'), '"\x1F"'],
-        [0177.chr('utf-8'), '"\x7F"']
-      ].should be_computed_by(:dump)
-    end
+    [ [0000.chr('utf-8'), '"\x00"'],
+      [0001.chr('utf-8'), '"\x01"'],
+      [0002.chr('utf-8'), '"\x02"'],
+      [0003.chr('utf-8'), '"\x03"'],
+      [0004.chr('utf-8'), '"\x04"'],
+      [0005.chr('utf-8'), '"\x05"'],
+      [0006.chr('utf-8'), '"\x06"'],
+      [0016.chr('utf-8'), '"\x0E"'],
+      [0017.chr('utf-8'), '"\x0F"'],
+      [0020.chr('utf-8'), '"\x10"'],
+      [0021.chr('utf-8'), '"\x11"'],
+      [0022.chr('utf-8'), '"\x12"'],
+      [0023.chr('utf-8'), '"\x13"'],
+      [0024.chr('utf-8'), '"\x14"'],
+      [0025.chr('utf-8'), '"\x15"'],
+      [0026.chr('utf-8'), '"\x16"'],
+      [0027.chr('utf-8'), '"\x17"'],
+      [0030.chr('utf-8'), '"\x18"'],
+      [0031.chr('utf-8'), '"\x19"'],
+      [0032.chr('utf-8'), '"\x1A"'],
+      [0034.chr('utf-8'), '"\x1C"'],
+      [0035.chr('utf-8'), '"\x1D"'],
+      [0036.chr('utf-8'), '"\x1E"'],
+      [0037.chr('utf-8'), '"\x1F"'],
+      [0177.chr('utf-8'), '"\x7F"']
+    ].should be_computed_by(:dump)
   end
 
   it "returns a string with multi-byte UTF-8 characters less than or equal 0xFFFF replaced by \\uXXXX notation with upper-case hex digits" do
@@ -381,10 +379,8 @@
   end
 
   it "returns a string with multi-byte UTF-8 characters greater than 0xFFFF replaced by \\u{XXXXXX} notation with upper-case hex digits" do
-    NATFIXME 'returns a string with multi-byte UTF-8 characters greater than 0xFFFF replaced by \\u{XXXXXX} notation with upper-case hex digits', exception: SpecFailedException do
-      0x10000.chr('utf-8').dump.should == '"\u{10000}"'
-      0x10FFFF.chr('utf-8').dump.should == '"\u{10FFFF}"'
-    end
+    0x10000.chr('utf-8').dump.should == '"\u{10000}"'
+    0x10FFFF.chr('utf-8').dump.should == '"\u{10FFFF}"'
   end
 
   it "includes .force_encoding(name) if the encoding isn't ASCII compatible" do

diff --git a/src/encoding/utf8_encoding_object.cpp b/src/encoding/utf8_encoding_object.cpp
@@ -159,8 +159,11 @@ bool Utf8EncodingObject::is_printable_char(const nat_int_t c) const {
 }
 
 String Utf8EncodingObject::escaped_char(const nat_int_t c) const {
-    char buf[7];
-    snprintf(buf, 7, "\\u%04llX", c);
+    char buf[21];
+    if (c > 0xFFFF)
+        snprintf(buf, sizeof(buf), "\\u{%llX}", c);
+    else
+        snprintf(buf, sizeof(buf), "\\u%04llX", c);
     return String(buf);
 }
 

diff --git a/src/string_object.cpp b/src/string_object.cpp
@@ -503,24 +503,25 @@ Value StringObject::tr_in_place(Env *env, Value from_value, Value to_value) {
     return this;
 }
 
-StringObject *StringObject::inspect(Env *env) const {
+static StringObject *inspect_internal(const StringObject *str, Env *env, bool for_dump = false) {
     StringObject *out = new StringObject { "\"" };
+    auto encoding = str->encoding();
 
     size_t index = 0;
-    auto [valid, ch] = next_char_result(&index);
+    auto [valid, ch] = str->next_char_result(&index);
     while (!ch.is_empty()) {
         if (!valid) {
             for (size_t i = 0; i < ch.size(); i++)
                 out->append_sprintf("\\x%02X", static_cast<uint8_t>(ch[i]));
-            auto pair = next_char_result(&index);
+            auto pair = str->next_char_result(&index);
             valid = pair.first;
             ch = pair.second;
             continue;
         }
-        const auto c = m_encoding->decode_codepoint(ch);
-        auto pair = next_char_result(&index);
+        const auto c = encoding->decode_codepoint(ch);
+        auto pair = str->next_char_result(&index);
         valid = pair.first;
-        const auto c2 = !valid || ch.is_empty() ? 0 : m_encoding->decode_codepoint(pair.second);
+        const auto c2 = !valid || ch.is_empty() ? 0 : encoding->decode_codepoint(pair.second);
 
         if (c == '"' || c == '\\' || (c == '#' && (c2 == '{' || c2 == '$' || c2 == '@'))) {
             out->append_char('\\');
@@ -541,11 +542,13 @@ StringObject *StringObject::inspect(Env *env) const {
             out->append("\\t");
         } else if (c == '\v') {
             out->append("\\v");
-        } else if (m_encoding->is_printable_char(c)) {
+        } else if (encoding->is_printable_char(c) && (!for_dump || c <= 0xFFFF)) {
             out->append(ch);
         } else {
-            auto escaped_char = m_encoding->escaped_char(c);
-            out->append(escaped_char);
+            if (for_dump && c < 128)
+                out->append_sprintf("\\x%02X", c);
+            else
+                out->append(encoding->escaped_char(c));
         }
         ch = pair.second;
     }
@@ -554,6 +557,10 @@ StringObject *StringObject::inspect(Env *env) const {
     return out;
 }
 
+StringObject *StringObject::inspect(Env *env) const {
+    return inspect_internal(this, env);
+}
+
 String StringObject::dbg_inspect() const {
     return String::format("\"{}\"", m_string);
 }
@@ -2922,7 +2929,7 @@ Value StringObject::downcase_in_place(Env *env, Value arg1, Value arg2) {
 
 Value StringObject::dump(Env *env) {
     auto result = new StringObject { m_encoding };
-    result->append(inspect(env));
+    result->append(inspect_internal(this, env, true));
     if (!m_encoding->is_ascii_compatible()) {
         result->append_sprintf(".force_encoding(\"%s\")", m_encoding->name()->c_str());
     }