From 407f51bddd532a795adabd90ce76d57de6e52691 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Tue, 25 Jun 2024 08:15:08 -0500 Subject: [PATCH 1/2] Add IBM437 -> Unicode map --- .../encoding/ibm437_encoding_object.hpp | 5 + src/encoding/ibm437_encoding_object.cpp | 141 +++++++++++++++++- 2 files changed, 145 insertions(+), 1 deletion(-) diff --git a/include/natalie/encoding/ibm437_encoding_object.hpp b/include/natalie/encoding/ibm437_encoding_object.hpp index 6f269b89d..2e5da5113 100644 --- a/include/natalie/encoding/ibm437_encoding_object.hpp +++ b/include/natalie/encoding/ibm437_encoding_object.hpp @@ -35,6 +35,11 @@ class Ibm437EncodingObject : public EncodingObject { virtual nat_int_t decode_codepoint(StringView &str) const override; virtual bool is_single_byte_encoding() const override final { return true; } + +private: + static const TM::Hashmap &to_unicode_map(); + + static inline TM::Hashmap s_to_unicode_map {}; }; } diff --git a/src/encoding/ibm437_encoding_object.cpp b/src/encoding/ibm437_encoding_object.cpp index 9678a52c0..0d59ed187 100644 --- a/src/encoding/ibm437_encoding_object.cpp +++ b/src/encoding/ibm437_encoding_object.cpp @@ -26,7 +26,12 @@ String Ibm437EncodingObject::escaped_char(const nat_int_t c) const { nat_int_t Ibm437EncodingObject::to_unicode_codepoint(nat_int_t codepoint) const { if (codepoint >= 0x00 && codepoint <= 0x7F) return codepoint; - NAT_NOT_YET_IMPLEMENTED("Conversion above Unicode Basic Latin (0x00..0x7F) not implemented"); + + auto result = to_unicode_map().get(codepoint); + if (result == 0) + return -1; + + return result; } nat_int_t Ibm437EncodingObject::from_unicode_codepoint(nat_int_t codepoint) const { @@ -48,4 +53,138 @@ nat_int_t Ibm437EncodingObject::decode_codepoint(StringView &str) const { } } +const TM::Hashmap &Ibm437EncodingObject::to_unicode_map() { + if (s_to_unicode_map.is_empty()) { + s_to_unicode_map.put(0x80, 0xC7); + s_to_unicode_map.put(0x81, 0xFC); + s_to_unicode_map.put(0x82, 0xE9); + s_to_unicode_map.put(0x83, 0xE2); + s_to_unicode_map.put(0x84, 0xE4); + s_to_unicode_map.put(0x85, 0xE0); + s_to_unicode_map.put(0x86, 0xE5); + s_to_unicode_map.put(0x87, 0xE7); + s_to_unicode_map.put(0x88, 0xEA); + s_to_unicode_map.put(0x89, 0xEB); + s_to_unicode_map.put(0x8A, 0xE8); + s_to_unicode_map.put(0x8B, 0xEF); + s_to_unicode_map.put(0x8C, 0xEE); + s_to_unicode_map.put(0x8D, 0xEC); + s_to_unicode_map.put(0x8E, 0xC4); + s_to_unicode_map.put(0x8F, 0xC5); + s_to_unicode_map.put(0x90, 0xC9); + s_to_unicode_map.put(0x91, 0xE6); + s_to_unicode_map.put(0x92, 0xC6); + s_to_unicode_map.put(0x93, 0xF4); + s_to_unicode_map.put(0x94, 0xF6); + s_to_unicode_map.put(0x95, 0xF2); + s_to_unicode_map.put(0x96, 0xFB); + s_to_unicode_map.put(0x97, 0xF9); + s_to_unicode_map.put(0x98, 0xFF); + s_to_unicode_map.put(0x99, 0xD6); + s_to_unicode_map.put(0x9A, 0xDC); + s_to_unicode_map.put(0x9B, 0xA2); + s_to_unicode_map.put(0x9C, 0xA3); + s_to_unicode_map.put(0x9D, 0xA5); + s_to_unicode_map.put(0x9E, 0x20A7); + s_to_unicode_map.put(0x9F, 0x192); + s_to_unicode_map.put(0xA0, 0xE1); + s_to_unicode_map.put(0xA1, 0xED); + s_to_unicode_map.put(0xA2, 0xF3); + s_to_unicode_map.put(0xA3, 0xFA); + s_to_unicode_map.put(0xA4, 0xF1); + s_to_unicode_map.put(0xA5, 0xD1); + s_to_unicode_map.put(0xA6, 0xAA); + s_to_unicode_map.put(0xA7, 0xBA); + s_to_unicode_map.put(0xA8, 0xBF); + s_to_unicode_map.put(0xA9, 0x2310); + s_to_unicode_map.put(0xAA, 0xAC); + s_to_unicode_map.put(0xAB, 0xBD); + s_to_unicode_map.put(0xAC, 0xBC); + s_to_unicode_map.put(0xAD, 0xA1); + s_to_unicode_map.put(0xAE, 0xAB); + s_to_unicode_map.put(0xAF, 0xBB); + s_to_unicode_map.put(0xB0, 0x2591); + s_to_unicode_map.put(0xB1, 0x2592); + s_to_unicode_map.put(0xB2, 0x2593); + s_to_unicode_map.put(0xB3, 0x2502); + s_to_unicode_map.put(0xB4, 0x2524); + s_to_unicode_map.put(0xB5, 0x2561); + s_to_unicode_map.put(0xB6, 0x2562); + s_to_unicode_map.put(0xB7, 0x2556); + s_to_unicode_map.put(0xB8, 0x2555); + s_to_unicode_map.put(0xB9, 0x2563); + s_to_unicode_map.put(0xBA, 0x2551); + s_to_unicode_map.put(0xBB, 0x2557); + s_to_unicode_map.put(0xBC, 0x255D); + s_to_unicode_map.put(0xBD, 0x255C); + s_to_unicode_map.put(0xBE, 0x255B); + s_to_unicode_map.put(0xBF, 0x2510); + s_to_unicode_map.put(0xC0, 0x2514); + s_to_unicode_map.put(0xC1, 0x2534); + s_to_unicode_map.put(0xC2, 0x252C); + s_to_unicode_map.put(0xC3, 0x251C); + s_to_unicode_map.put(0xC4, 0x2500); + s_to_unicode_map.put(0xC5, 0x253C); + s_to_unicode_map.put(0xC6, 0x255E); + s_to_unicode_map.put(0xC7, 0x255F); + s_to_unicode_map.put(0xC8, 0x255A); + s_to_unicode_map.put(0xC9, 0x2554); + s_to_unicode_map.put(0xCA, 0x2569); + s_to_unicode_map.put(0xCB, 0x2566); + s_to_unicode_map.put(0xCC, 0x2560); + s_to_unicode_map.put(0xCD, 0x2550); + s_to_unicode_map.put(0xCE, 0x256C); + s_to_unicode_map.put(0xCF, 0x2567); + s_to_unicode_map.put(0xD0, 0x2568); + s_to_unicode_map.put(0xD1, 0x2564); + s_to_unicode_map.put(0xD2, 0x2565); + s_to_unicode_map.put(0xD3, 0x2559); + s_to_unicode_map.put(0xD4, 0x2558); + s_to_unicode_map.put(0xD5, 0x2552); + s_to_unicode_map.put(0xD6, 0x2553); + s_to_unicode_map.put(0xD7, 0x256B); + s_to_unicode_map.put(0xD8, 0x256A); + s_to_unicode_map.put(0xD9, 0x2518); + s_to_unicode_map.put(0xDA, 0x250C); + s_to_unicode_map.put(0xDB, 0x2588); + s_to_unicode_map.put(0xDC, 0x2584); + s_to_unicode_map.put(0xDD, 0x258C); + s_to_unicode_map.put(0xDE, 0x2590); + s_to_unicode_map.put(0xDF, 0x2580); + s_to_unicode_map.put(0xE0, 0x3B1); + s_to_unicode_map.put(0xE1, 0xDF); + s_to_unicode_map.put(0xE2, 0x393); + s_to_unicode_map.put(0xE3, 0x3C0); + s_to_unicode_map.put(0xE4, 0x3A3); + s_to_unicode_map.put(0xE5, 0x3C3); + s_to_unicode_map.put(0xE6, 0xB5); + s_to_unicode_map.put(0xE7, 0x3C4); + s_to_unicode_map.put(0xE8, 0x3A6); + s_to_unicode_map.put(0xE9, 0x398); + s_to_unicode_map.put(0xEA, 0x3A9); + s_to_unicode_map.put(0xEB, 0x3B4); + s_to_unicode_map.put(0xEC, 0x221E); + s_to_unicode_map.put(0xED, 0x3C6); + s_to_unicode_map.put(0xEE, 0x3B5); + s_to_unicode_map.put(0xEF, 0x2229); + s_to_unicode_map.put(0xF0, 0x2261); + s_to_unicode_map.put(0xF1, 0xB1); + s_to_unicode_map.put(0xF2, 0x2265); + s_to_unicode_map.put(0xF3, 0x2264); + s_to_unicode_map.put(0xF4, 0x2320); + s_to_unicode_map.put(0xF5, 0x2321); + s_to_unicode_map.put(0xF6, 0xF7); + s_to_unicode_map.put(0xF7, 0x2248); + s_to_unicode_map.put(0xF8, 0xB0); + s_to_unicode_map.put(0xF9, 0x2219); + s_to_unicode_map.put(0xFA, 0xB7); + s_to_unicode_map.put(0xFB, 0x221A); + s_to_unicode_map.put(0xFC, 0x207F); + s_to_unicode_map.put(0xFD, 0xB2); + s_to_unicode_map.put(0xFE, 0x25A0); + s_to_unicode_map.put(0xFF, 0xA0); + } + return s_to_unicode_map; +} + } From 359197e0e66df4f31079db2e52cad6339c06db68 Mon Sep 17 00:00:00 2001 From: Tim Morgan Date: Tue, 25 Jun 2024 08:15:30 -0500 Subject: [PATCH 2/2] Honor source encoding argument in String#encode --- include/natalie/object.hpp | 1 + spec/core/string/encode_spec.rb | 4 +- spec/core/string/shared/encode.rb | 78 +++++++++++++------------------ src/object.cpp | 6 +++ src/string_object.cpp | 9 ++-- 5 files changed, 47 insertions(+), 51 deletions(-) diff --git a/include/natalie/object.hpp b/include/natalie/object.hpp index b889e36c3..8c8329375 100644 --- a/include/natalie/object.hpp +++ b/include/natalie/object.hpp @@ -223,6 +223,7 @@ class Object : public Cell { ArrayObject *as_array_or_raise(Env *); ClassObject *as_class_or_raise(Env *); + EncodingObject *as_encoding_or_raise(Env *); ExceptionObject *as_exception_or_raise(Env *); FloatObject *as_float_or_raise(Env *); HashObject *as_hash_or_raise(Env *); diff --git a/spec/core/string/encode_spec.rb b/spec/core/string/encode_spec.rb index 66e66f107..d73ae7399 100644 --- a/spec/core/string/encode_spec.rb +++ b/spec/core/string/encode_spec.rb @@ -136,7 +136,7 @@ describe "when passed to, from" do it "returns a copy in the destination encoding when both encodings are the same" do - NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /from ASCII-8BIT to UTF-8/ do + NATFIXME 'same encoding for source and destination', exception: SpecFailedException, message: '# should be == to #' do str = "あ".dup.force_encoding("binary") encoded = str.encode("utf-8", "utf-8") @@ -172,7 +172,7 @@ end it "returns a copy in the destination encoding when both encodings are the same" do - NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /from ASCII-8BIT to UTF-8/ do + NATFIXME 'same encoding for source and destination', exception: SpecFailedException, message: '# should be == to #' do str = "あ".dup.force_encoding("binary") encoded = str.encode("utf-8", "utf-8", invalid: :replace) diff --git a/spec/core/string/shared/encode.rb b/spec/core/string/shared/encode.rb index 040bc88e7..be1d7e321 100644 --- a/spec/core/string/shared/encode.rb +++ b/spec/core/string/shared/encode.rb @@ -127,25 +127,21 @@ describe "when passed to, from" do it "transcodes between the encodings ignoring the String encoding" do - NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do - str = "あ" - result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8') - result.force_encoding Encoding::EUC_JP - str.send(@method, "euc-jp", "ibm437").should == result - end + str = "あ" + result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8') + result.force_encoding Encoding::EUC_JP + str.send(@method, "euc-jp", "ibm437").should == result end it "calls #to_str to convert the from object to an Encoding" do - NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do - enc = mock("string encode encoding") - enc.should_receive(:to_str).and_return("ibm437") + enc = mock("string encode encoding") + enc.should_receive(:to_str).and_return("ibm437") - str = "あ" - result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8') - result.force_encoding Encoding::EUC_JP + str = "あ" + result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8') + result.force_encoding Encoding::EUC_JP - str.send(@method, "euc-jp", enc).should == result - end + str.send(@method, "euc-jp", enc).should == result end end @@ -174,53 +170,43 @@ describe "when passed to, from, options" do it "replaces undefined characters in the destination encoding" do - NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to EUC-JP/ do - str = "あ?あ".force_encoding Encoding::BINARY - result = str.send(@method, "euc-jp", "utf-8", undef: :replace) - xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') - result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") - end + str = "あ?あ".force_encoding Encoding::BINARY + result = str.send(@method, "euc-jp", "utf-8", undef: :replace) + xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8') + result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp") end it "replaces invalid characters in the destination encoding" do - NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do - xFF = [0xFF].pack('C').force_encoding('utf-8') - str = "ab#{xFF}c".force_encoding Encoding::BINARY - str.send(@method, "iso-8859-1", "utf-8", invalid: :replace).should == "ab?c" - end + xFF = [0xFF].pack('C').force_encoding('utf-8') + str = "ab#{xFF}c".force_encoding Encoding::BINARY + str.send(@method, "iso-8859-1", "utf-8", invalid: :replace).should == "ab?c" end it "calls #to_str to convert the to object to an encoding" do - NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do - to = mock("string encode to encoding") - to.should_receive(:to_str).and_return("iso-8859-1") + to = mock("string encode to encoding") + to.should_receive(:to_str).and_return("iso-8859-1") - xFF = [0xFF].pack('C').force_encoding('utf-8') - str = "ab#{xFF}c".force_encoding Encoding::BINARY - str.send(@method, to, "utf-8", invalid: :replace).should == "ab?c" - end + xFF = [0xFF].pack('C').force_encoding('utf-8') + str = "ab#{xFF}c".force_encoding Encoding::BINARY + str.send(@method, to, "utf-8", invalid: :replace).should == "ab?c" end it "calls #to_str to convert the from object to an encoding" do - NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do - from = mock("string encode to encoding") - from.should_receive(:to_str).and_return("utf-8") + from = mock("string encode to encoding") + from.should_receive(:to_str).and_return("utf-8") - xFF = [0xFF].pack('C').force_encoding('utf-8') - str = "ab#{xFF}c".force_encoding Encoding::BINARY - str.send(@method, "iso-8859-1", from, invalid: :replace).should == "ab?c" - end + xFF = [0xFF].pack('C').force_encoding('utf-8') + str = "ab#{xFF}c".force_encoding Encoding::BINARY + str.send(@method, "iso-8859-1", from, invalid: :replace).should == "ab?c" end it "calls #to_hash to convert the options object" do - NATFIXME 'keyword splat should call to_hash?' do - options = mock("string encode options") - options.should_receive(:to_hash).and_return({ invalid: :replace }) + options = mock("string encode options") + options.should_receive(:to_hash).and_return({ invalid: :replace }) - xFF = [0xFF].pack('C').force_encoding('utf-8') - str = "ab#{xFF}c".force_encoding Encoding::BINARY - str.send(@method, "iso-8859-1", "utf-8", **options).should == "ab?c" - end + xFF = [0xFF].pack('C').force_encoding('utf-8') + str = "ab#{xFF}c".force_encoding Encoding::BINARY + str.send(@method, "iso-8859-1", "utf-8", **options).should == "ab?c" end end diff --git a/src/object.cpp b/src/object.cpp index 82c3c6511..890d99c9d 100644 --- a/src/object.cpp +++ b/src/object.cpp @@ -547,6 +547,12 @@ ClassObject *Object::as_class_or_raise(Env *env) { return static_cast(this); } +EncodingObject *Object::as_encoding_or_raise(Env *env) { + if (!is_encoding()) + env->raise("TypeError", "{} can't be coerced into Encoding", m_klass->inspect_str()); + return static_cast(this); +} + ExceptionObject *Object::as_exception_or_raise(Env *env) { if (!is_exception()) env->raise("TypeError", "{} can't be coerced into Exception", m_klass->inspect_str()); diff --git a/src/string_object.cpp b/src/string_object.cpp index 971c7b58b..8f9d58569 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -1137,6 +1137,9 @@ Value StringObject::encode_in_place(Env *env, Value dst_encoding, Value src_enco if (!dst_encoding) dst_encoding = EncodingObject::get(Encoding::UTF_8); + if (!src_encoding) + src_encoding = m_encoding; + EncodeOptions options; if (kwargs) { if (kwargs->remove(env, "universal_newline"_s)) @@ -1172,9 +1175,9 @@ Value StringObject::encode_in_place(Env *env, Value dst_encoding, Value src_enco } env->ensure_no_extra_keywords(kwargs); - auto orig_encoding = m_encoding; - EncodingObject *encoding_obj = EncodingObject::find_encoding(env, dst_encoding); - return encoding_obj->encode(env, orig_encoding, this, options); + EncodingObject *dst_encoding_obj = EncodingObject::find_encoding(env, dst_encoding); + EncodingObject *src_encoding_obj = EncodingObject::find_encoding(env, src_encoding); + return dst_encoding_obj->encode(env, src_encoding_obj, this, options); } Value StringObject::force_encoding(Env *env, Value encoding) {