From 28fc13179b8895b6d5f986e24d429d4a36a616f6 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 22 Jun 2024 07:17:44 -0500 Subject: [PATCH 1/5] Add :fallback option to String#encode --- include/natalie/encoding_object.hpp | 1 + spec/core/string/encode_spec.rb | 32 ++++++------- src/encoding_object.cpp | 70 +++++++++++++++++++---------- src/string_object.cpp | 4 ++ 4 files changed, 64 insertions(+), 43 deletions(-) diff --git a/include/natalie/encoding_object.hpp b/include/natalie/encoding_object.hpp index e6fc6d52f..c70e99938 100644 --- a/include/natalie/encoding_object.hpp +++ b/include/natalie/encoding_object.hpp @@ -95,6 +95,7 @@ class EncodingObject : public Object { EncodeInvalidOption invalid_option = EncodeInvalidOption::Raise; EncodeNewlineOption newline_option = EncodeNewlineOption::None; StringObject *replace_option = nullptr; + Value fallback_option = nullptr; }; virtual Value encode(Env *, EncodingObject *, StringObject *, EncodeOptions) const; diff --git a/spec/core/string/encode_spec.rb b/spec/core/string/encode_spec.rb index e141427de..8449f0a72 100644 --- a/spec/core/string/encode_spec.rb +++ b/spec/core/string/encode_spec.rb @@ -92,21 +92,19 @@ end it "replace multiple invalid bytes at the end with a single replacement character" do - NATFIXME 'encode options' do + NATFIXME 'encoding from utf-8 to utf-8 with invalid chars' do "\xE3\x81\x93\xE3\x81".encode("UTF-8", invalid: :replace).should == "\u3053\ufffd" end end it "replaces invalid encoding in source using a specified replacement even when a fallback is given" do - NATFIXME 'encode options' do - encoded = "ち\xE3\x81\xFF".encode("UTF-16LE", invalid: :replace, replace: "foo", fallback: -> c { "bar" }) - encoded.should == "\u3061foofoo".encode("UTF-16LE") - encoded.encode("UTF-8").should == "ちfoofoo" - end + encoded = "ち\xE3\x81\xFF".encode("UTF-16LE", invalid: :replace, replace: "foo", fallback: -> c { "bar" }) + encoded.should == "\u3061foofoo".encode("UTF-16LE") + encoded.encode("UTF-8").should == "ちfoofoo" end it "replaces undefined encoding in destination with default replacement" do - NATFIXME 'encode options' do + NATFIXME 'undef option' do encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace) encoded.should == "B?".encode(Encoding::US_ASCII) encoded.encode("UTF-8").should == "B?" @@ -114,7 +112,7 @@ end it "replaces undefined encoding in destination with a specified replacement" do - NATFIXME 'encode options' do + NATFIXME 'undef option' do encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo") encoded.should == "Bfoo".encode(Encoding::US_ASCII) encoded.encode("UTF-8").should == "Bfoo" @@ -122,7 +120,7 @@ end it "replaces undefined encoding in destination with a specified replacement even if a fallback is given" do - NATFIXME 'encode options' do + NATFIXME 'undef option' do encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo", fallback: proc {|x| "bar"}) encoded.should == "Bfoo".encode(Encoding::US_ASCII) encoded.encode("UTF-8").should == "Bfoo" @@ -130,19 +128,15 @@ end it "replaces undefined encoding in destination using a fallback proc" do - NATFIXME 'encode fallback' do - encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc {|x| "bar"}) - encoded.should == "Bbar".encode(Encoding::US_ASCII) - encoded.encode("UTF-8").should == "Bbar" - end + encoded = "B\ufffd".encode(Encoding::US_ASCII, fallback: proc {|x| "bar"}) + encoded.should == "Bbar".encode(Encoding::US_ASCII) + encoded.encode("UTF-8").should == "Bbar" end it "replaces invalid encoding in source using replace even when fallback is given as proc" do - NATFIXME 'encode options' do - encoded = "ち\xE3\x81\xFF".encode("UTF-16LE", invalid: :replace, replace: "foo", fallback: proc {|x| "bar"}) - encoded.should == "\u3061foofoo".encode("UTF-16LE") - encoded.encode("UTF-8").should == "ちfoofoo" - end + encoded = "ち\xE3\x81\xFF".encode("UTF-16LE", invalid: :replace, replace: "foo", fallback: proc {|x| "bar"}) + encoded.should == "\u3061foofoo".encode("UTF-16LE") + encoded.encode("UTF-8").should == "ちfoofoo" end end diff --git a/src/encoding_object.cpp b/src/encoding_object.cpp index 8faa03b52..8bbe38972 100644 --- a/src/encoding_object.cpp +++ b/src/encoding_object.cpp @@ -54,18 +54,39 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje auto source_codepoint = valid ? c : -1; - nat_int_t unicode_codepoint; - if (source_codepoint == -1) { + constexpr nat_int_t INVALID_CONTINUE = -1; + constexpr nat_int_t INVALID_RAISE = -2; + auto handle_invalid = [&](nat_int_t cpt) -> nat_int_t { switch (options.invalid_option) { case EncodeInvalidOption::Raise: - env->raise_invalid_byte_sequence_error(this); + if (options.fallback_option) { + Value result; + if (options.fallback_option->respond_to(env, "[]"_s)) { + result = options.fallback_option->send(env, "[]"_s, { Value::integer(c) }); + } else if (options.fallback_option->respond_to(env, "call"_s)) { + result = options.fallback_option->send(env, "call"_s, { Value::integer(c) }); + } + temp_string.append(result->to_s(env)); + return INVALID_CONTINUE; + } + return INVALID_RAISE; case EncodeInvalidOption::Replace: if (options.replace_option) { temp_string.append(options.replace_option); - continue; + return INVALID_CONTINUE; } - unicode_codepoint = 0xFFFD; + return 0xFFFD; } + NAT_UNREACHABLE(); + }; + + nat_int_t unicode_codepoint; + if (source_codepoint == -1) { + unicode_codepoint = handle_invalid(source_codepoint); + if (unicode_codepoint == INVALID_CONTINUE) + continue; + if (unicode_codepoint == INVALID_RAISE) + env->raise_invalid_byte_sequence_error(this); } else { unicode_codepoint = orig_encoding->to_unicode_codepoint(source_codepoint); } @@ -73,7 +94,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje // handle error if (unicode_codepoint < 0) { StringObject *message; - if (num() != Encoding::UTF_8) { // Example: "\x8F" to UTF-8 in conversion from ASCII-8BIT to UTF-8 to US-ASCII message = StringObject::format( @@ -87,7 +107,6 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje String::hex(source_codepoint, String::HexFormat::Uppercase), orig_encoding->name()); } - env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message); } @@ -95,24 +114,27 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje // handle error if (destination_codepoint < 0) { - StringObject *message; - - if (orig_encoding->num() != Encoding::UTF_8) - message = StringObject::format( - // Example: "U+043F to WINDOWS-1252 in conversion from Windows-1251 to UTF-8 to WINDOWS-1252", - "U+{} to {} in conversion from {} to UTF-8 to {}", - String::hex(source_codepoint, String::HexFormat::Uppercase), - name(), - orig_encoding->name(), - name()); - else { - // Example: U+0439 from UTF-8 to ASCII-8BIT - auto hex = String(); - hex.append_sprintf("%04X", source_codepoint); - message = StringObject::format("U+{} from UTF-8 to {}", hex, name()); + destination_codepoint = handle_invalid(source_codepoint); + if (destination_codepoint == INVALID_CONTINUE) + continue; + if (destination_codepoint == INVALID_RAISE) { + StringObject *message; + if (orig_encoding->num() != Encoding::UTF_8) + message = StringObject::format( + // Example: "U+043F to WINDOWS-1252 in conversion from Windows-1251 to UTF-8 to WINDOWS-1252", + "U+{} to {} in conversion from {} to UTF-8 to {}", + String::hex(source_codepoint, String::HexFormat::Uppercase), + name(), + orig_encoding->name(), + name()); + else { + // Example: U+0439 from UTF-8 to ASCII-8BIT + auto hex = String(); + hex.append_sprintf("%04X", source_codepoint); + message = StringObject::format("U+{} from UTF-8 to {}", hex, name()); + } + env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message); } - - env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message); } auto destination_char_obj = encode_codepoint(destination_codepoint); diff --git a/src/string_object.cpp b/src/string_object.cpp index 093184788..d09713cbb 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -1157,6 +1157,10 @@ Value StringObject::encode_in_place(Env *env, Value dst_encoding, Value src_enco auto replace = kwargs->remove(env, "replace"_s); if (replace && !replace->is_nil()) options.replace_option = replace->as_string_or_raise(env)->encode(env, dst_encoding)->as_string_or_raise(env); + + auto fallback = kwargs->remove(env, "fallback"_s); + if (fallback && !fallback->is_nil()) + options.fallback_option = fallback; } env->ensure_no_extra_keywords(kwargs); From 8d122eebc46cb7c75dd2aad949a9a027675eb413 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 22 Jun 2024 07:32:54 -0500 Subject: [PATCH 2/5] Add :undef option to String#encode --- include/natalie/encoding_object.hpp | 6 +++ include/natalie/string_object.hpp | 1 + spec/core/string/encode_spec.rb | 26 ++++------ spec/core/string/shared/encode.rb | 74 +++++++++++++---------------- src/encoding_object.cpp | 45 ++++++++++-------- src/string_object.cpp | 8 ++++ 6 files changed, 82 insertions(+), 78 deletions(-) diff --git a/include/natalie/encoding_object.hpp b/include/natalie/encoding_object.hpp index c70e99938..97b8ed0e3 100644 --- a/include/natalie/encoding_object.hpp +++ b/include/natalie/encoding_object.hpp @@ -84,6 +84,11 @@ class EncodingObject : public Object { Replace, }; + enum class EncodeUndefOption { + Raise, + Replace, + }; + enum class EncodeNewlineOption { None, Cr, @@ -93,6 +98,7 @@ class EncodingObject : public Object { struct EncodeOptions { EncodeInvalidOption invalid_option = EncodeInvalidOption::Raise; + EncodeUndefOption undef_option = EncodeUndefOption::Raise; EncodeNewlineOption newline_option = EncodeNewlineOption::None; StringObject *replace_option = nullptr; Value fallback_option = nullptr; diff --git a/include/natalie/string_object.hpp b/include/natalie/string_object.hpp index 641d4e6c3..7dd434524 100644 --- a/include/natalie/string_object.hpp +++ b/include/natalie/string_object.hpp @@ -511,6 +511,7 @@ class StringObject : public Object { using EncodeOptions = EncodingObject::EncodeOptions; using EncodeInvalidOption = EncodingObject::EncodeInvalidOption; using EncodeNewlineOption = EncodingObject::EncodeNewlineOption; + using EncodeUndefOption = EncodingObject::EncodeUndefOption; String m_string {}; EncodingObject *m_encoding { nullptr }; diff --git a/spec/core/string/encode_spec.rb b/spec/core/string/encode_spec.rb index 8449f0a72..8ba7116ff 100644 --- a/spec/core/string/encode_spec.rb +++ b/spec/core/string/encode_spec.rb @@ -112,19 +112,15 @@ end it "replaces undefined encoding in destination with a specified replacement" do - NATFIXME 'undef option' do - encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo") - encoded.should == "Bfoo".encode(Encoding::US_ASCII) - encoded.encode("UTF-8").should == "Bfoo" - end + encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo") + encoded.should == "Bfoo".encode(Encoding::US_ASCII) + encoded.encode("UTF-8").should == "Bfoo" end it "replaces undefined encoding in destination with a specified replacement even if a fallback is given" do - NATFIXME 'undef option' do - encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo", fallback: proc {|x| "bar"}) - encoded.should == "Bfoo".encode(Encoding::US_ASCII) - encoded.encode("UTF-8").should == "Bfoo" - end + encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace, replace: "foo", fallback: proc {|x| "bar"}) + encoded.should == "Bfoo".encode(Encoding::US_ASCII) + encoded.encode("UTF-8").should == "Bfoo" end it "replaces undefined encoding in destination using a fallback proc" do @@ -162,12 +158,10 @@ describe "when passed to, options" do it "returns a copy when the destination encoding is the same as the String encoding" do - NATFIXME 'encode options' do - str = "あ" - encoded = str.encode(Encoding::UTF_8, undef: :replace) - encoded.should_not equal(str) - encoded.should == str - end + str = "あ" + encoded = str.encode(Encoding::UTF_8, undef: :replace) + encoded.should_not equal(str) + encoded.should == str end end diff --git a/spec/core/string/shared/encode.rb b/spec/core/string/shared/encode.rb index fce411b02..b099beec9 100644 --- a/spec/core/string/shared/encode.rb +++ b/spec/core/string/shared/encode.rb @@ -88,20 +88,16 @@ describe "when passed options" do it "does not process transcoding options if not transcoding" do - NATFIXME 'encode options' do - result = "あ\ufffdあ".send(@method, undef: :replace) - result.should == "あ\ufffdあ" - end + result = "あ\ufffdあ".send(@method, undef: :replace) + result.should == "あ\ufffdあ" end it "calls #to_hash to convert the object" do - NATFIXME 'encode options' do - options = mock("string encode options") - options.should_receive(:to_hash).and_return({ undef: :replace }) + options = mock("string encode options") + options.should_receive(:to_hash).and_return({ undef: :replace }) - result = "あ\ufffdあ".send(@method, **options) - result.should == "あ\ufffdあ" - end + result = "あ\ufffdあ".send(@method, **options) + result.should == "あ\ufffdあ" end it "transcodes to Encoding.default_internal when set" do @@ -111,7 +107,7 @@ end it "raises an Encoding::ConverterNotFoundError when no conversion is possible despite 'invalid: :replace, undef: :replace'" do - NATFIXME 'encode options' do + NATFIXME 'undef option' do Encoding.default_internal = Encoding::Emacs_Mule str = [0x80].pack('C').force_encoding Encoding::BINARY -> do @@ -121,7 +117,7 @@ end it "replaces invalid characters when replacing Emacs-Mule encoded strings" do - NATFIXME 'encode options' do + NATFIXME 'Emacs-Mule' do got = [0x80].pack('C').force_encoding('Emacs-Mule').send(@method, invalid: :replace) got.should == "?".encode('Emacs-Mule') @@ -155,12 +151,10 @@ describe "when passed to, options" do it "replaces undefined characters in the destination encoding" do - NATFIXME 'encode options' do - result = "あ?あ".send(@method, Encoding::EUC_JP, undef: :replace) - # testing for: "\xA4\xA2?\xA4\xA2" - xA4xA2 = [0xA4, 0xA2].pack('CC') - result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") - end + result = "あ?あ".send(@method, Encoding::EUC_JP, undef: :replace) + # testing for: "\xA4\xA2?\xA4\xA2" + xA4xA2 = [0xA4, 0xA2].pack('CC') + result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") end it "replaces invalid characters in the destination encoding" do @@ -171,20 +165,18 @@ end it "calls #to_hash to convert the options object" do - NATFIXME 'encode options' do - options = mock("string encode options") - options.should_receive(:to_hash).and_return({ undef: :replace }) + options = mock("string encode options") + options.should_receive(:to_hash).and_return({ undef: :replace }) - result = "あ?あ".send(@method, Encoding::EUC_JP, **options) - xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') - result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") - end + result = "あ?あ".send(@method, Encoding::EUC_JP, **options) + xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') + result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") end end describe "when passed to, from, options" do it "replaces undefined characters in the destination encoding" do - NATFIXME 'encode options' do + NATFIXME 'undef option and src encoding' do str = "あ?あ".force_encoding Encoding::BINARY result = str.send(@method, "euc-jp", "utf-8", undef: :replace) xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') @@ -193,7 +185,7 @@ end it "replaces invalid characters in the destination encoding" do - NATFIXME 'encode options' do + NATFIXME 'src encoding' do xFF = [0xFF].pack('C').force_encoding('utf-8') str = "ab#{xFF}c".force_encoding Encoding::BINARY str.send(@method, "iso-8859-1", "utf-8", invalid: :replace).should == "ab?c" @@ -201,7 +193,7 @@ end it "calls #to_str to convert the to object to an encoding" do - NATFIXME 'encode options' do + NATFIXME 'src encoding' do to = mock("string encode to encoding") to.should_receive(:to_str).and_return("iso-8859-1") @@ -212,7 +204,7 @@ end it "calls #to_str to convert the from object to an encoding" do - NATFIXME 'encode options' do + NATFIXME 'src encoding' do from = mock("string encode to encoding") from.should_receive(:to_str).and_return("utf-8") @@ -223,7 +215,7 @@ end it "calls #to_hash to convert the options object" do - NATFIXME 'encode options' do + NATFIXME 'keyword splat should call to_hash?' do options = mock("string encode options") options.should_receive(:to_hash).and_return({ invalid: :replace }) @@ -236,31 +228,31 @@ describe "given the xml: :text option" do it "replaces all instances of '&' with '&'" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do '& and &'.send(@method, "UTF-8", xml: :text).should == '& and &' end end it "replaces all instances of '<' with '<'" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do '< and <'.send(@method, "UTF-8", xml: :text).should == '< and <' end end it "replaces all instances of '>' with '>'" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do '> and >'.send(@method, "UTF-8", xml: :text).should == '> and >' end end it "does not replace '\"'" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do '" and "'.send(@method, "UTF-8", xml: :text).should == '" and "' end end it "replaces undefined characters with their upper-case hexadecimal numeric character references" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do 'ürst'.send(@method, Encoding::US_ASCII, xml: :text).should == 'ürst' end end @@ -268,37 +260,37 @@ describe "given the xml: :attr option" do it "surrounds the encoded text with double-quotes" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do 'abc'.send(@method, "UTF-8", xml: :attr).should == '"abc"' end end it "replaces all instances of '&' with '&'" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do '& and &'.send(@method, "UTF-8", xml: :attr).should == '"& and &"' end end it "replaces all instances of '<' with '<'" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do '< and <'.send(@method, "UTF-8", xml: :attr).should == '"< and <"' end end it "replaces all instances of '>' with '>'" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do '> and >'.send(@method, "UTF-8", xml: :attr).should == '"> and >"' end end it "replaces all instances of '\"' with '"'" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do '" and "'.send(@method, "UTF-8", xml: :attr).should == '"" and ""' end end it "replaces undefined characters with their upper-case hexadecimal numeric character references" do - NATFIXME 'encode options' do + NATFIXME 'xml option' do 'ürst'.send(@method, Encoding::US_ASCII, xml: :attr).should == '"ürst"' end end diff --git a/src/encoding_object.cpp b/src/encoding_object.cpp index 8bbe38972..2b8f0b991 100644 --- a/src/encoding_object.cpp +++ b/src/encoding_object.cpp @@ -54,9 +54,8 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje auto source_codepoint = valid ? c : -1; - constexpr nat_int_t INVALID_CONTINUE = -1; - constexpr nat_int_t INVALID_RAISE = -2; - auto handle_invalid = [&](nat_int_t cpt) -> nat_int_t { + nat_int_t unicode_codepoint; + if (source_codepoint == -1) { switch (options.invalid_option) { case EncodeInvalidOption::Raise: if (options.fallback_option) { @@ -67,26 +66,16 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje result = options.fallback_option->send(env, "call"_s, { Value::integer(c) }); } temp_string.append(result->to_s(env)); - return INVALID_CONTINUE; + continue; } - return INVALID_RAISE; + env->raise_invalid_byte_sequence_error(this); case EncodeInvalidOption::Replace: if (options.replace_option) { temp_string.append(options.replace_option); - return INVALID_CONTINUE; + continue; } - return 0xFFFD; + unicode_codepoint = 0xFFFD; } - NAT_UNREACHABLE(); - }; - - nat_int_t unicode_codepoint; - if (source_codepoint == -1) { - unicode_codepoint = handle_invalid(source_codepoint); - if (unicode_codepoint == INVALID_CONTINUE) - continue; - if (unicode_codepoint == INVALID_RAISE) - env->raise_invalid_byte_sequence_error(this); } else { unicode_codepoint = orig_encoding->to_unicode_codepoint(source_codepoint); } @@ -114,10 +103,18 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje // handle error if (destination_codepoint < 0) { - destination_codepoint = handle_invalid(source_codepoint); - if (destination_codepoint == INVALID_CONTINUE) - continue; - if (destination_codepoint == INVALID_RAISE) { + switch (options.undef_option) { + case EncodeUndefOption::Raise: + if (options.fallback_option) { + Value result; + if (options.fallback_option->respond_to(env, "[]"_s)) { + result = options.fallback_option->send(env, "[]"_s, { Value::integer(c) }); + } else if (options.fallback_option->respond_to(env, "call"_s)) { + result = options.fallback_option->send(env, "call"_s, { Value::integer(c) }); + } + temp_string.append(result->to_s(env)); + continue; + } StringObject *message; if (orig_encoding->num() != Encoding::UTF_8) message = StringObject::format( @@ -134,6 +131,12 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje message = StringObject::format("U+{} from UTF-8 to {}", hex, name()); } env->raise(EncodingClass->const_find(env, "UndefinedConversionError"_s)->as_class(), message); + case EncodeUndefOption::Replace: + if (options.replace_option) { + temp_string.append(options.replace_option); + continue; + } + destination_codepoint = 0xFFFD; } } diff --git a/src/string_object.cpp b/src/string_object.cpp index d09713cbb..971c7b58b 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -1154,6 +1154,14 @@ Value StringObject::encode_in_place(Env *env, Value dst_encoding, Value src_enco options.invalid_option = EncodeInvalidOption::Replace; } + auto undef = kwargs->remove(env, "undef"_s); + if (undef) { + if (undef->is_nil()) + options.undef_option = EncodeUndefOption::Raise; + else if (undef == "replace"_s) + options.undef_option = EncodeUndefOption::Replace; + } + auto replace = kwargs->remove(env, "replace"_s); if (replace && !replace->is_nil()) options.replace_option = replace->as_string_or_raise(env)->encode(env, dst_encoding)->as_string_or_raise(env); From 4365e7b2ddcccc56ece7c86bfbca9f9275323fba Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 22 Jun 2024 07:36:32 -0500 Subject: [PATCH 3/5] DRY up fallback handling in string encode --- src/encoding_object.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/encoding_object.cpp b/src/encoding_object.cpp index 2b8f0b991..8c038ab25 100644 --- a/src/encoding_object.cpp +++ b/src/encoding_object.cpp @@ -52,6 +52,16 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje break; } + auto handle_fallback = [&](nat_int_t cpt) { + Value result; + if (options.fallback_option->respond_to(env, "[]"_s)) { + result = options.fallback_option->send(env, "[]"_s, { Value::integer(cpt) }); + } else if (options.fallback_option->respond_to(env, "call"_s)) { + result = options.fallback_option->send(env, "call"_s, { Value::integer(cpt) }); + } + temp_string.append(result->to_s(env)); + }; + auto source_codepoint = valid ? c : -1; nat_int_t unicode_codepoint; @@ -59,13 +69,7 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje switch (options.invalid_option) { case EncodeInvalidOption::Raise: if (options.fallback_option) { - Value result; - if (options.fallback_option->respond_to(env, "[]"_s)) { - result = options.fallback_option->send(env, "[]"_s, { Value::integer(c) }); - } else if (options.fallback_option->respond_to(env, "call"_s)) { - result = options.fallback_option->send(env, "call"_s, { Value::integer(c) }); - } - temp_string.append(result->to_s(env)); + handle_fallback(c); continue; } env->raise_invalid_byte_sequence_error(this); @@ -106,13 +110,7 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje switch (options.undef_option) { case EncodeUndefOption::Raise: if (options.fallback_option) { - Value result; - if (options.fallback_option->respond_to(env, "[]"_s)) { - result = options.fallback_option->send(env, "[]"_s, { Value::integer(c) }); - } else if (options.fallback_option->respond_to(env, "call"_s)) { - result = options.fallback_option->send(env, "call"_s, { Value::integer(c) }); - } - temp_string.append(result->to_s(env)); + handle_fallback(unicode_codepoint); continue; } StringObject *message; From 183916453ed13a3ee1cebec1901096c92ace5747 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 22 Jun 2024 07:38:54 -0500 Subject: [PATCH 4/5] Use '?' for replacement char sometimes --- spec/core/string/encode_spec.rb | 8 +++----- spec/core/string/shared/encode.rb | 6 ++---- src/encoding_object.cpp | 10 ++++++++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/spec/core/string/encode_spec.rb b/spec/core/string/encode_spec.rb index 8ba7116ff..9c2412106 100644 --- a/spec/core/string/encode_spec.rb +++ b/spec/core/string/encode_spec.rb @@ -104,11 +104,9 @@ end it "replaces undefined encoding in destination with default replacement" do - NATFIXME 'undef option' do - encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace) - encoded.should == "B?".encode(Encoding::US_ASCII) - encoded.encode("UTF-8").should == "B?" - end + encoded = "B\ufffd".encode(Encoding::US_ASCII, undef: :replace) + encoded.should == "B?".encode(Encoding::US_ASCII) + encoded.encode("UTF-8").should == "B?" end it "replaces undefined encoding in destination with a specified replacement" do diff --git a/spec/core/string/shared/encode.rb b/spec/core/string/shared/encode.rb index b099beec9..5d435f93c 100644 --- a/spec/core/string/shared/encode.rb +++ b/spec/core/string/shared/encode.rb @@ -158,10 +158,8 @@ end it "replaces invalid characters in the destination encoding" do - NATFIXME 'encode options' do - xFF = [0xFF].pack('C').force_encoding('utf-8') - "ab#{xFF}c".send(@method, Encoding::ISO_8859_1, invalid: :replace).should == "ab?c" - end + xFF = [0xFF].pack('C').force_encoding('utf-8') + "ab#{xFF}c".send(@method, Encoding::ISO_8859_1, invalid: :replace).should == "ab?c" end it "calls #to_hash to convert the options object" do diff --git a/src/encoding_object.cpp b/src/encoding_object.cpp index 8c038ab25..9c2439649 100644 --- a/src/encoding_object.cpp +++ b/src/encoding_object.cpp @@ -78,7 +78,10 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje temp_string.append(options.replace_option); continue; } - unicode_codepoint = 0xFFFD; + if (is_single_byte_encoding()) + unicode_codepoint = '?'; + else + unicode_codepoint = 0xFFFD; } } else { unicode_codepoint = orig_encoding->to_unicode_codepoint(source_codepoint); @@ -134,7 +137,10 @@ Value EncodingObject::encode(Env *env, EncodingObject *orig_encoding, StringObje temp_string.append(options.replace_option); continue; } - destination_codepoint = 0xFFFD; + if (is_single_byte_encoding()) + destination_codepoint = '?'; + else + destination_codepoint = 0xFFFD; } } From ee993699d7ca7f66615dfa3fd9b624404b5aabba Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Sat, 22 Jun 2024 07:54:31 -0500 Subject: [PATCH 5/5] Improve NATFIXME markers in failing specs --- spec/core/string/encode_spec.rb | 8 ++--- spec/core/string/shared/encode.rb | 50 +++++++++++++++---------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/spec/core/string/encode_spec.rb b/spec/core/string/encode_spec.rb index 9c2412106..66e66f107 100644 --- a/spec/core/string/encode_spec.rb +++ b/spec/core/string/encode_spec.rb @@ -92,7 +92,7 @@ end it "replace multiple invalid bytes at the end with a single replacement character" do - NATFIXME 'encoding from utf-8 to utf-8 with invalid chars' do + NATFIXME 'same encoding with invalid chars', exception: SpecFailedException, message: /should be ==/ do "\xE3\x81\x93\xE3\x81".encode("UTF-8", invalid: :replace).should == "\u3053\ufffd" end end @@ -136,7 +136,7 @@ describe "when passed to, from" do it "returns a copy in the destination encoding when both encodings are the same" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /from ASCII-8BIT to UTF-8/ do str = "あ".dup.force_encoding("binary") encoded = str.encode("utf-8", "utf-8") @@ -147,7 +147,7 @@ end it "returns the transcoded string" do - NATFIXME 'not sure' do + NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do str = "\x00\x00\x00\x1F" str.encode(Encoding::UTF_8, Encoding::UTF_16BE).should == "\u0000\u001f" end @@ -172,7 +172,7 @@ end it "returns a copy in the destination encoding when both encodings are the same" do - NATFIXME 'encode options' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /from ASCII-8BIT to UTF-8/ do str = "あ".dup.force_encoding("binary") encoded = str.encode("utf-8", "utf-8", invalid: :replace) diff --git a/spec/core/string/shared/encode.rb b/spec/core/string/shared/encode.rb index 5d435f93c..096e5a30e 100644 --- a/spec/core/string/shared/encode.rb +++ b/spec/core/string/shared/encode.rb @@ -9,7 +9,7 @@ end it "transcodes a 7-bit String despite no generic converting being available" do - NATFIXME 'converter not found' do + NATFIXME 'need Emacs_Mule encoding', exception: SpecFailedException, message: /uninitialized constant Encoding::Emacs_Mule/ do -> do Encoding::Converter.new Encoding::Emacs_Mule, Encoding::BINARY end.should raise_error(Encoding::ConverterNotFoundError) @@ -22,7 +22,7 @@ end it "raises an Encoding::ConverterNotFoundError when no conversion is possible" do - NATFIXME 'no conversion' do + NATFIXME 'need Emacs_Mule encoding', exception: NameError, message: /uninitialized constant Encoding::Emacs_Mule/ do Encoding.default_internal = Encoding::Emacs_Mule str = [0x80].pack('C').force_encoding Encoding::BINARY -> { str.send(@method) }.should raise_error(Encoding::ConverterNotFoundError) @@ -50,7 +50,7 @@ end it "transcodes Japanese multibyte characters" do - NATFIXME 'not sure' do + NATFIXME 'need ISO_2022_JP encoding', exception: NameError, message: /uninitialized constant Encoding::ISO_2022_JP/ do str = "あいうえお" str.send(@method, Encoding::ISO_2022_JP).should == "\e\x24\x42\x24\x22\x24\x24\x24\x26\x24\x28\x24\x2A\e\x28\x42".force_encoding(Encoding::ISO_2022_JP) @@ -58,7 +58,7 @@ end it "transcodes a 7-bit String despite no generic converting being available" do - NATFIXME 'converter not found' do + NATFIXME 'need Emacs_Mule encoding', exception: SpecFailedException, message: /uninitialized constant Encoding::Emacs_Mule/ do -> do Encoding::Converter.new Encoding::Emacs_Mule, Encoding::BINARY end.should raise_error(Encoding::ConverterNotFoundError) @@ -69,7 +69,7 @@ end it "raises an Encoding::ConverterNotFoundError when no conversion is possible" do - NATFIXME 'converter not found' do + NATFIXME 'need Emacs_Mule encoding', exception: SpecFailedException, message: /uninitialized constant Encoding::Emacs_Mule/ do str = [0x80].pack('C').force_encoding Encoding::BINARY -> do str.send(@method, Encoding::Emacs_Mule) @@ -78,7 +78,7 @@ end it "raises an Encoding::ConverterNotFoundError for an invalid encoding" do - NATFIXME 'converter not found' do + NATFIXME 'wrong error', exception: SpecFailedException, message: /ConverterNotFoundError.*but instead.*ArgumentError/ do -> do "abc".send(@method, "xyz") end.should raise_error(Encoding::ConverterNotFoundError) @@ -107,7 +107,7 @@ end it "raises an Encoding::ConverterNotFoundError when no conversion is possible despite 'invalid: :replace, undef: :replace'" do - NATFIXME 'undef option' do + NATFIXME 'need Emacs_Mule encoding', exception: NameError, message: /uninitialized constant Encoding::Emacs_Mule/ do Encoding.default_internal = Encoding::Emacs_Mule str = [0x80].pack('C').force_encoding Encoding::BINARY -> do @@ -117,7 +117,7 @@ end it "replaces invalid characters when replacing Emacs-Mule encoded strings" do - NATFIXME 'Emacs-Mule' do + NATFIXME 'need Emacs_Mule encoding', exception: ArgumentError, message: 'unknown encoding name - "Emacs-Mule"' do got = [0x80].pack('C').force_encoding('Emacs-Mule').send(@method, invalid: :replace) got.should == "?".encode('Emacs-Mule') @@ -127,7 +127,7 @@ describe "when passed to, from" do it "transcodes between the encodings ignoring the String encoding" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do str = "あ" result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8') result.force_encoding Encoding::EUC_JP @@ -136,7 +136,7 @@ end it "calls #to_str to convert the from object to an Encoding" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do enc = mock("string encode encoding") enc.should_receive(:to_str).and_return("ibm437") @@ -174,7 +174,7 @@ describe "when passed to, from, options" do it "replaces undefined characters in the destination encoding" do - NATFIXME 'undef option and src encoding' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to EUC-JP/ do str = "あ?あ".force_encoding Encoding::BINARY result = str.send(@method, "euc-jp", "utf-8", undef: :replace) xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') @@ -183,7 +183,7 @@ end it "replaces invalid characters in the destination encoding" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do xFF = [0xFF].pack('C').force_encoding('utf-8') str = "ab#{xFF}c".force_encoding Encoding::BINARY str.send(@method, "iso-8859-1", "utf-8", invalid: :replace).should == "ab?c" @@ -191,7 +191,7 @@ end it "calls #to_str to convert the to object to an encoding" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do to = mock("string encode to encoding") to.should_receive(:to_str).and_return("iso-8859-1") @@ -202,7 +202,7 @@ end it "calls #to_str to convert the from object to an encoding" do - NATFIXME 'src encoding' do + NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do from = mock("string encode to encoding") from.should_receive(:to_str).and_return("utf-8") @@ -226,31 +226,31 @@ describe "given the xml: :text option" do it "replaces all instances of '&' with '&'" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '& and &'.send(@method, "UTF-8", xml: :text).should == '& and &' end end it "replaces all instances of '<' with '<'" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '< and <'.send(@method, "UTF-8", xml: :text).should == '< and <' end end it "replaces all instances of '>' with '>'" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '> and >'.send(@method, "UTF-8", xml: :text).should == '> and >' end end it "does not replace '\"'" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '" and "'.send(@method, "UTF-8", xml: :text).should == '" and "' end end it "replaces undefined characters with their upper-case hexadecimal numeric character references" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do 'ürst'.send(@method, Encoding::US_ASCII, xml: :text).should == 'ürst' end end @@ -258,37 +258,37 @@ describe "given the xml: :attr option" do it "surrounds the encoded text with double-quotes" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do 'abc'.send(@method, "UTF-8", xml: :attr).should == '"abc"' end end it "replaces all instances of '&' with '&'" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '& and &'.send(@method, "UTF-8", xml: :attr).should == '"& and &"' end end it "replaces all instances of '<' with '<'" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '< and <'.send(@method, "UTF-8", xml: :attr).should == '"< and <"' end end it "replaces all instances of '>' with '>'" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '> and >'.send(@method, "UTF-8", xml: :attr).should == '"> and >"' end end it "replaces all instances of '\"' with '"'" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do '" and "'.send(@method, "UTF-8", xml: :attr).should == '"" and ""' end end it "replaces undefined characters with their upper-case hexadecimal numeric character references" do - NATFIXME 'xml option' do + NATFIXME 'xml option', exception: ArgumentError, message: 'unknown keyword: :xml' do 'ürst'.send(@method, Encoding::US_ASCII, xml: :attr).should == '"ürst"' end end