Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Honor source encoding argument in String#encode #2141

Merged
merged 2 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions include/natalie/encoding/ibm437_encoding_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ class Ibm437EncodingObject : public EncodingObject {
virtual nat_int_t decode_codepoint(StringView &str) const override;

virtual bool is_single_byte_encoding() const override final { return true; }

private:
// Accessor for the IBM437 -> Unicode lookup table used by
// to_unicode_codepoint() for bytes above the ASCII range. The definition
// fills s_to_unicode_map on the first call (when it is still empty) and
// returns a reference to it.
static const TM::Hashmap<nat_int_t, nat_int_t> &to_unicode_map();

// Backing storage for to_unicode_map(): key is the IBM437 byte value,
// value is the corresponding Unicode codepoint. Default-initialized here.
static inline TM::Hashmap<nat_int_t, nat_int_t> s_to_unicode_map {};
};

}
1 change: 1 addition & 0 deletions include/natalie/object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ class Object : public Cell {

ArrayObject *as_array_or_raise(Env *);
ClassObject *as_class_or_raise(Env *);
EncodingObject *as_encoding_or_raise(Env *);
ExceptionObject *as_exception_or_raise(Env *);
FloatObject *as_float_or_raise(Env *);
HashObject *as_hash_or_raise(Env *);
Expand Down
4 changes: 2 additions & 2 deletions spec/core/string/encode_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@

describe "when passed to, from" do
it "returns a copy in the destination encoding when both encodings are the same" do
NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /from ASCII-8BIT to UTF-8/ do
NATFIXME 'same encoding for source and destination', exception: SpecFailedException, message: '#<Encoding:ASCII-8BIT> should be == to #<Encoding:UTF-8>' do
str = "あ".dup.force_encoding("binary")
encoded = str.encode("utf-8", "utf-8")

Expand Down Expand Up @@ -172,7 +172,7 @@
end

it "returns a copy in the destination encoding when both encodings are the same" do
NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /from ASCII-8BIT to UTF-8/ do
NATFIXME 'same encoding for source and destination', exception: SpecFailedException, message: '#<Encoding:ASCII-8BIT> should be == to #<Encoding:UTF-8>' do
str = "あ".dup.force_encoding("binary")
encoded = str.encode("utf-8", "utf-8", invalid: :replace)

Expand Down
78 changes: 32 additions & 46 deletions spec/core/string/shared/encode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -127,25 +127,21 @@

describe "when passed to, from" do
it "transcodes between the encodings ignoring the String encoding" do
NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do
str = "あ"
result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8')
result.force_encoding Encoding::EUC_JP
str.send(@method, "euc-jp", "ibm437").should == result
end
str = "あ"
result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8')
result.force_encoding Encoding::EUC_JP
str.send(@method, "euc-jp", "ibm437").should == result
end

it "calls #to_str to convert the from object to an Encoding" do
NATFIXME 'honor source encoding', exception: SpecFailedException, message: /should be ==/ do
enc = mock("string encode encoding")
enc.should_receive(:to_str).and_return("ibm437")
enc = mock("string encode encoding")
enc.should_receive(:to_str).and_return("ibm437")

str = "あ"
result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8')
result.force_encoding Encoding::EUC_JP
str = "あ"
result = [0xA6, 0xD0, 0x8F, 0xAB, 0xE4, 0x8F, 0xAB, 0xB1].pack('C8')
result.force_encoding Encoding::EUC_JP

str.send(@method, "euc-jp", enc).should == result
end
str.send(@method, "euc-jp", enc).should == result
end
end

Expand Down Expand Up @@ -174,53 +170,43 @@

describe "when passed to, from, options" do
it "replaces undefined characters in the destination encoding" do
NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to EUC-JP/ do
str = "あ?あ".force_encoding Encoding::BINARY
result = str.send(@method, "euc-jp", "utf-8", undef: :replace)
xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8')
result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp")
end
str = "あ?あ".force_encoding Encoding::BINARY
result = str.send(@method, "euc-jp", "utf-8", undef: :replace)
xA4xA2 = [0xA4, 0xA2].pack('CC').force_encoding('utf-8')
result.should == "#{xA4xA2}?#{xA4xA2}".force_encoding("euc-jp")
end

it "replaces invalid characters in the destination encoding" do
NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do
xFF = [0xFF].pack('C').force_encoding('utf-8')
str = "ab#{xFF}c".force_encoding Encoding::BINARY
str.send(@method, "iso-8859-1", "utf-8", invalid: :replace).should == "ab?c"
end
xFF = [0xFF].pack('C').force_encoding('utf-8')
str = "ab#{xFF}c".force_encoding Encoding::BINARY
str.send(@method, "iso-8859-1", "utf-8", invalid: :replace).should == "ab?c"
end

it "calls #to_str to convert the to object to an encoding" do
NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do
to = mock("string encode to encoding")
to.should_receive(:to_str).and_return("iso-8859-1")
to = mock("string encode to encoding")
to.should_receive(:to_str).and_return("iso-8859-1")

xFF = [0xFF].pack('C').force_encoding('utf-8')
str = "ab#{xFF}c".force_encoding Encoding::BINARY
str.send(@method, to, "utf-8", invalid: :replace).should == "ab?c"
end
xFF = [0xFF].pack('C').force_encoding('utf-8')
str = "ab#{xFF}c".force_encoding Encoding::BINARY
str.send(@method, to, "utf-8", invalid: :replace).should == "ab?c"
end

it "calls #to_str to convert the from object to an encoding" do
NATFIXME 'honor source encoding', exception: Encoding::UndefinedConversionError, message: /to UTF-8 in conversion from ASCII-8BIT to UTF-8 to ISO-8859-1/ do
from = mock("string encode to encoding")
from.should_receive(:to_str).and_return("utf-8")
from = mock("string encode to encoding")
from.should_receive(:to_str).and_return("utf-8")

xFF = [0xFF].pack('C').force_encoding('utf-8')
str = "ab#{xFF}c".force_encoding Encoding::BINARY
str.send(@method, "iso-8859-1", from, invalid: :replace).should == "ab?c"
end
xFF = [0xFF].pack('C').force_encoding('utf-8')
str = "ab#{xFF}c".force_encoding Encoding::BINARY
str.send(@method, "iso-8859-1", from, invalid: :replace).should == "ab?c"
end

it "calls #to_hash to convert the options object" do
NATFIXME 'keyword splat should call to_hash?' do
options = mock("string encode options")
options.should_receive(:to_hash).and_return({ invalid: :replace })
options = mock("string encode options")
options.should_receive(:to_hash).and_return({ invalid: :replace })

xFF = [0xFF].pack('C').force_encoding('utf-8')
str = "ab#{xFF}c".force_encoding Encoding::BINARY
str.send(@method, "iso-8859-1", "utf-8", **options).should == "ab?c"
end
xFF = [0xFF].pack('C').force_encoding('utf-8')
str = "ab#{xFF}c".force_encoding Encoding::BINARY
str.send(@method, "iso-8859-1", "utf-8", **options).should == "ab?c"
end
end

Expand Down
141 changes: 140 additions & 1 deletion src/encoding/ibm437_encoding_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ String Ibm437EncodingObject::escaped_char(const nat_int_t c) const {
// Converts an IBM437 codepoint (byte value) to its Unicode codepoint.
// Values 0x00..0x7F are returned unchanged; anything else is looked up in
// to_unicode_map(). A lookup result of 0 is treated as "no mapping" and
// reported to the caller as -1.
nat_int_t Ibm437EncodingObject::to_unicode_codepoint(nat_int_t codepoint) const {
if (codepoint >= 0x00 && codepoint <= 0x7F)
return codepoint;
// NOTE(review): this page is a diff rendering with the +/- markers stripped.
// The hunk header reports exactly one deletion, which is presumably the
// NAT_NOT_YET_IMPLEMENTED line below; confirm it is absent from the merged file.
NAT_NOT_YET_IMPLEMENTED("Conversion above Unicode Basic Latin (0x00..0x7F) not implemented");

auto result = to_unicode_map().get(codepoint);
// 0 is used as the "not found" signal here.
if (result == 0)
return -1;

return result;
}

nat_int_t Ibm437EncodingObject::from_unicode_codepoint(nat_int_t codepoint) const {
Expand All @@ -48,4 +53,138 @@ nat_int_t Ibm437EncodingObject::decode_codepoint(StringView &str) const {
}
}

// Returns the lookup table from IBM437 byte values 0x80..0xFF to Unicode
// codepoints. The shared static map is filled the first time this is called
// (detected by the map still being empty) and is returned as-is afterwards.
const TM::Hashmap<nat_int_t, nat_int_t> &Ibm437EncodingObject::to_unicode_map() {
    if (!s_to_unicode_map.is_empty())
        return s_to_unicode_map;

    // Unicode codepoints for bytes 0x80..0xFF, listed in byte order,
    // eight entries per row (row comment gives the first byte of the row).
    static const nat_int_t high_half[] = {
        0xC7, 0xFC, 0xE9, 0xE2, 0xE4, 0xE0, 0xE5, 0xE7, // 0x80
        0xEA, 0xEB, 0xE8, 0xEF, 0xEE, 0xEC, 0xC4, 0xC5, // 0x88
        0xC9, 0xE6, 0xC6, 0xF4, 0xF6, 0xF2, 0xFB, 0xF9, // 0x90
        0xFF, 0xD6, 0xDC, 0xA2, 0xA3, 0xA5, 0x20A7, 0x192, // 0x98
        0xE1, 0xED, 0xF3, 0xFA, 0xF1, 0xD1, 0xAA, 0xBA, // 0xA0
        0xBF, 0x2310, 0xAC, 0xBD, 0xBC, 0xA1, 0xAB, 0xBB, // 0xA8
        0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, // 0xB0
        0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510, // 0xB8
        0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, // 0xC0
        0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567, // 0xC8
        0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, // 0xD0
        0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580, // 0xD8
        0x3B1, 0xDF, 0x393, 0x3C0, 0x3A3, 0x3C3, 0xB5, 0x3C4, // 0xE0
        0x3A6, 0x398, 0x3A9, 0x3B4, 0x221E, 0x3C6, 0x3B5, 0x2229, // 0xE8
        0x2261, 0xB1, 0x2265, 0x2264, 0x2320, 0x2321, 0xF7, 0x2248, // 0xF0
        0xB0, 0x2219, 0xB7, 0x221A, 0x207F, 0xB2, 0x25A0, 0xA0, // 0xF8
    };

    // Insert in ascending byte order, starting at the first non-ASCII byte.
    nat_int_t ibm437_byte = 0x80;
    for (const auto unicode_codepoint : high_half)
        s_to_unicode_map.put(ibm437_byte++, unicode_codepoint);

    return s_to_unicode_map;
}

}
6 changes: 6 additions & 0 deletions src/object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,12 @@ ClassObject *Object::as_class_or_raise(Env *env) {
return static_cast<ClassObject *>(this);
}

// Returns this object viewed as an EncodingObject.
// When is_encoding() is false, raises a TypeError through env with the
// message "<class inspect string> can't be coerced into Encoding".
EncodingObject *Object::as_encoding_or_raise(Env *env) {
if (!is_encoding())
env->raise("TypeError", "{} can't be coerced into Encoding", m_klass->inspect_str());
// The type check above has passed, so the downcast is taken as valid.
return static_cast<EncodingObject *>(this);
}

ExceptionObject *Object::as_exception_or_raise(Env *env) {
if (!is_exception())
env->raise("TypeError", "{} can't be coerced into Exception", m_klass->inspect_str());
Expand Down
9 changes: 6 additions & 3 deletions src/string_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1137,6 +1137,9 @@ Value StringObject::encode_in_place(Env *env, Value dst_encoding, Value src_enco
if (!dst_encoding)
dst_encoding = EncodingObject::get(Encoding::UTF_8);

if (!src_encoding)
src_encoding = m_encoding;

EncodeOptions options;
if (kwargs) {
if (kwargs->remove(env, "universal_newline"_s))
Expand Down Expand Up @@ -1172,9 +1175,9 @@ Value StringObject::encode_in_place(Env *env, Value dst_encoding, Value src_enco
}

env->ensure_no_extra_keywords(kwargs);
auto orig_encoding = m_encoding;
EncodingObject *encoding_obj = EncodingObject::find_encoding(env, dst_encoding);
return encoding_obj->encode(env, orig_encoding, this, options);
EncodingObject *dst_encoding_obj = EncodingObject::find_encoding(env, dst_encoding);
EncodingObject *src_encoding_obj = EncodingObject::find_encoding(env, src_encoding);
return dst_encoding_obj->encode(env, src_encoding_obj, this, options);
}

Value StringObject::force_encoding(Env *env, Value encoding) {
Expand Down
Loading