diff --git a/include/natalie/encoding_object.hpp b/include/natalie/encoding_object.hpp index 879a36fef..e2947c246 100644 --- a/include/natalie/encoding_object.hpp +++ b/include/natalie/encoding_object.hpp @@ -14,12 +14,25 @@ namespace Natalie { +const int SPECIAL_CASE_LOWER_MAX_SIZE = 2; +const int SPECIAL_CASE_TITLE_MAX_SIZE = 3; +const int SPECIAL_CASE_UPPER_MAX_SIZE = 3; + +struct SpecialCasingEntry { + uint32_t code; + uint32_t lower[SPECIAL_CASE_LOWER_MAX_SIZE]; + uint32_t title[SPECIAL_CASE_TITLE_MAX_SIZE]; + uint32_t upper[SPECIAL_CASE_UPPER_MAX_SIZE]; +}; + extern nat_int_t lcase_map[]; extern nat_int_t ucase_map[]; extern nat_int_t tcase_map[]; extern nat_int_t lcase_index[]; extern nat_int_t ucase_index[]; extern nat_int_t tcase_index[]; +extern const int special_casing_map_size; +extern SpecialCasingEntry special_casing_map[]; using namespace TM; @@ -82,9 +95,13 @@ class EncodingObject : public Object { static EncodingObject *find_encoding_by_name(Env *env, String name); static EncodingObject *find_encoding(Env *env, Value encoding); - static nat_int_t codepoint_to_lowercase(nat_int_t codepoint, bool ascii_only = false); - static nat_int_t codepoint_to_uppercase(nat_int_t codepoint, bool ascii_only = false); - static nat_int_t codepoint_to_titlecase(nat_int_t codepoint, bool ascii_only = false); + // must pass a buffer of nat_int_t to this function; uint8_t return is number of codepoints written + static uint8_t codepoint_to_lowercase(nat_int_t codepoint, nat_int_t result[], bool ascii_only = false); + static uint8_t codepoint_to_uppercase(nat_int_t codepoint, nat_int_t result[], bool ascii_only = false); + static uint8_t codepoint_to_titlecase(nat_int_t codepoint, nat_int_t result[], bool ascii_only = false); + + static void init_special_casing_map(); + static SpecialCasingEntry find_special_casing_map_entry(nat_int_t codepoint); static Value casefold_common(nat_int_t codepoint); static Value casefold_full(nat_int_t codepoint); diff --git 
a/lib/natalie/encoding/casemap_gen.rb b/lib/natalie/encoding/casemap_gen.rb index 507b4482c..ed32303cd 100644 --- a/lib/natalie/encoding/casemap_gen.rb +++ b/lib/natalie/encoding/casemap_gen.rb @@ -68,6 +68,28 @@ def build_map_and_index(blocks) ucase_map, ucase_index = build_map_and_index(ucase_blocks) tcase_map, tcase_index = build_map_and_index(tcase_blocks) +unless File.exist?('/tmp/SpecialCasing.txt') + File.write( + '/tmp/SpecialCasing.txt', + URI.open('http://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt').read + ) +end + +data = File.read('/tmp/SpecialCasing.txt') + .split(/# Conditional Mappings\n/) + .first + .split(/\n/) + .reject { |l| l.start_with?('#') || l.strip.empty? } +special_casing_map = data.map do |line| + parts = line.sub(/\s*#.*$/, '').split(/\s*;\s*/) # sub, not sub!: sub! returns nil when the line has no trailing comment + code, lower, title, upper = parts.map { |cc| cc.split.map { |c| c.to_i(16) } } + code = code.first + lower << 0 if lower.size < 2 + title << 0 if title.size < 3 + upper << 0 if upper.size < 3 + { code:, lower:, title:, upper: } +end + puts '// This file is auto-generated from http://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt' puts '// See casemap_gen.rb in this repository for instructions regenerating it.' puts '// DO NOT EDIT THIS FILE BY HAND!' 
@@ -89,4 +111,17 @@ def build_map_and_index(blocks) puts puts "nat_int_t tcase_index[] = {\n#{format_array(tcase_index, count_per_line: 10, num_width: 4)}\n};" puts +puts "const int special_casing_map_size = #{special_casing_map.size};" +puts "SpecialCasingEntry special_casing_map[#{special_casing_map.size}] = { { 0 } };" +puts +puts 'void EncodingObject::init_special_casing_map() {' +special_casing_map.each_with_index do |mapping, index| + code = "0x#{mapping[:code].to_s(16)}" + lower = "{ #{mapping[:lower].map { |c| "0x#{c.to_s(16)}" }.join(', ')} }" + title = "{ #{mapping[:title].map { |c| "0x#{c.to_s(16)}" }.join(', ')} }" + upper = "{ #{mapping[:upper].map { |c| "0x#{c.to_s(16)}" }.join(', ')} }" + puts " special_casing_map[#{index}] = { #{code}, #{lower}, #{title}, #{upper} };" +end +puts '}' +puts puts '}' diff --git a/spec/core/string/capitalize_spec.rb b/spec/core/string/capitalize_spec.rb index 34f78f4d6..cb4174f07 100644 --- a/spec/core/string/capitalize_spec.rb +++ b/spec/core/string/capitalize_spec.rb @@ -19,20 +19,16 @@ end it "only capitalizes the first resulting character when upcasing a character produces a multi-character sequence" do - NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do - "ß".capitalize.should == "Ss" - end + "ß".capitalize.should == "Ss" end it "updates string metadata" do - NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do - capitalized = "ßeT".capitalize + capitalized = "ßeT".capitalize - capitalized.should == "Sset" - capitalized.size.should == 4 - capitalized.bytesize.should == 4 - capitalized.ascii_only?.should be_true - end + capitalized.should == "Sset" + capitalized.size.should == 4 + capitalized.bytesize.should == 4 + capitalized.ascii_only?.should be_true end end @@ -119,11 +115,9 @@ end it "only capitalizes the first resulting character when upcasing a character produces a multi-character sequence" do - NATFIXME 'Pending unicode casemap support', exception: 
SpecFailedException do - a = "ß" - a.capitalize! - a.should == "Ss" - end + a = "ß" + a.capitalize! + a.should == "Ss" end it "works for non-ascii-compatible encodings" do @@ -133,15 +127,13 @@ end it "updates string metadata" do - NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do - capitalized = "ßeT" - capitalized.capitalize! + capitalized = "ßeT" + capitalized.capitalize! - capitalized.should == "Sset" - capitalized.size.should == 4 - capitalized.bytesize.should == 4 - capitalized.ascii_only?.should be_true - end + capitalized.should == "Sset" + capitalized.size.should == 4 + capitalized.bytesize.should == 4 + capitalized.ascii_only?.should be_true end end diff --git a/spec/core/string/upcase_spec.rb b/spec/core/string/upcase_spec.rb index 970f8a1b5..7f35ab49c 100644 --- a/spec/core/string/upcase_spec.rb +++ b/spec/core/string/upcase_spec.rb @@ -20,12 +20,10 @@ it "updates string metadata" do upcased = "aßet".upcase - NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do - upcased.should == "ASSET" - upcased.size.should == 5 - upcased.bytesize.should == 5 - upcased.ascii_only?.should be_true - end + upcased.should == "ASSET" + upcased.size.should == 5 + upcased.bytesize.should == 5 + upcased.ascii_only?.should be_true end end @@ -59,9 +57,7 @@ describe "full Unicode case mapping adapted for Lithuanian" do it "currently works the same as full Unicode case mapping" do - NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do - "iß".upcase(:lithuanian).should == "ISS" - end + "iß".upcase(:lithuanian).should == "ISS" end it "allows Turkic as an extra option (and applies Turkic semantics)" do @@ -118,12 +114,10 @@ upcased = "aßet" upcased.upcase! 
- NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do - upcased.should == "ASSET" - upcased.size.should == 5 - upcased.bytesize.should == 5 - upcased.ascii_only?.should be_true - end + upcased.should == "ASSET" + upcased.size.should == 5 + upcased.bytesize.should == 5 + upcased.ascii_only?.should be_true end end @@ -167,9 +161,7 @@ it "currently works the same as full Unicode case mapping" do a = "iß" a.upcase!(:lithuanian) - NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do - a.should == "ISS" - end + a.should == "ISS" end it "allows Turkic as an extra option (and applies Turkic semantics)" do diff --git a/src/encoding/casemap.cpp b/src/encoding/casemap.cpp index 93b85536a..f09f85d45 100644 --- a/src/encoding/casemap.cpp +++ b/src/encoding/casemap.cpp @@ -2010,7 +2010,7 @@ nat_int_t lcase_index[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0 + 0, 0, 0, 0 }; nat_int_t ucase_index[] = { @@ -2034,7 +2034,7 @@ nat_int_t ucase_index[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0 + 0, 0, 0, 0 }; nat_int_t tcase_index[] = { @@ -2058,7 +2058,116 @@ nat_int_t tcase_index[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0 + 0, 0, 0, 0 }; +const int special_casing_map_size = 103; +SpecialCasingEntry special_casing_map[103] = { { 0 } }; + +void EncodingObject::init_special_casing_map() { + special_casing_map[0] = { 0xdf, { 0xdf, 0x0 }, { 0x53, 0x73, 0x0 }, { 0x53, 0x53, 0x0 } }; + special_casing_map[1] = { 0x130, { 0x69, 0x307 }, { 0x130, 0x0 }, { 0x130, 0x0 } }; + special_casing_map[2] = { 0xfb00, { 0xfb00, 0x0 }, { 0x46, 0x66, 0x0 }, { 0x46, 0x46, 0x0 } }; + special_casing_map[3] = { 0xfb01, { 0xfb01, 0x0 }, { 0x46, 0x69, 0x0 }, { 0x46, 0x49, 0x0 } }; + special_casing_map[4] = { 0xfb02, { 0xfb02, 0x0 }, { 0x46, 0x6c, 0x0 }, { 0x46, 0x4c, 0x0 } }; + 
special_casing_map[5] = { 0xfb03, { 0xfb03, 0x0 }, { 0x46, 0x66, 0x69 }, { 0x46, 0x46, 0x49 } }; + special_casing_map[6] = { 0xfb04, { 0xfb04, 0x0 }, { 0x46, 0x66, 0x6c }, { 0x46, 0x46, 0x4c } }; + special_casing_map[7] = { 0xfb05, { 0xfb05, 0x0 }, { 0x53, 0x74, 0x0 }, { 0x53, 0x54, 0x0 } }; + special_casing_map[8] = { 0xfb06, { 0xfb06, 0x0 }, { 0x53, 0x74, 0x0 }, { 0x53, 0x54, 0x0 } }; + special_casing_map[9] = { 0x587, { 0x587, 0x0 }, { 0x535, 0x582, 0x0 }, { 0x535, 0x552, 0x0 } }; + special_casing_map[10] = { 0xfb13, { 0xfb13, 0x0 }, { 0x544, 0x576, 0x0 }, { 0x544, 0x546, 0x0 } }; + special_casing_map[11] = { 0xfb14, { 0xfb14, 0x0 }, { 0x544, 0x565, 0x0 }, { 0x544, 0x535, 0x0 } }; + special_casing_map[12] = { 0xfb15, { 0xfb15, 0x0 }, { 0x544, 0x56b, 0x0 }, { 0x544, 0x53b, 0x0 } }; + special_casing_map[13] = { 0xfb16, { 0xfb16, 0x0 }, { 0x54e, 0x576, 0x0 }, { 0x54e, 0x546, 0x0 } }; + special_casing_map[14] = { 0xfb17, { 0xfb17, 0x0 }, { 0x544, 0x56d, 0x0 }, { 0x544, 0x53d, 0x0 } }; + special_casing_map[15] = { 0x149, { 0x149, 0x0 }, { 0x2bc, 0x4e, 0x0 }, { 0x2bc, 0x4e, 0x0 } }; + special_casing_map[16] = { 0x390, { 0x390, 0x0 }, { 0x399, 0x308, 0x301 }, { 0x399, 0x308, 0x301 } }; + special_casing_map[17] = { 0x3b0, { 0x3b0, 0x0 }, { 0x3a5, 0x308, 0x301 }, { 0x3a5, 0x308, 0x301 } }; + special_casing_map[18] = { 0x1f0, { 0x1f0, 0x0 }, { 0x4a, 0x30c, 0x0 }, { 0x4a, 0x30c, 0x0 } }; + special_casing_map[19] = { 0x1e96, { 0x1e96, 0x0 }, { 0x48, 0x331, 0x0 }, { 0x48, 0x331, 0x0 } }; + special_casing_map[20] = { 0x1e97, { 0x1e97, 0x0 }, { 0x54, 0x308, 0x0 }, { 0x54, 0x308, 0x0 } }; + special_casing_map[21] = { 0x1e98, { 0x1e98, 0x0 }, { 0x57, 0x30a, 0x0 }, { 0x57, 0x30a, 0x0 } }; + special_casing_map[22] = { 0x1e99, { 0x1e99, 0x0 }, { 0x59, 0x30a, 0x0 }, { 0x59, 0x30a, 0x0 } }; + special_casing_map[23] = { 0x1e9a, { 0x1e9a, 0x0 }, { 0x41, 0x2be, 0x0 }, { 0x41, 0x2be, 0x0 } }; + special_casing_map[24] = { 0x1f50, { 0x1f50, 0x0 }, { 0x3a5, 0x313, 0x0 }, { 0x3a5, 0x313, 0x0 
} }; + special_casing_map[25] = { 0x1f52, { 0x1f52, 0x0 }, { 0x3a5, 0x313, 0x300 }, { 0x3a5, 0x313, 0x300 } }; + special_casing_map[26] = { 0x1f54, { 0x1f54, 0x0 }, { 0x3a5, 0x313, 0x301 }, { 0x3a5, 0x313, 0x301 } }; + special_casing_map[27] = { 0x1f56, { 0x1f56, 0x0 }, { 0x3a5, 0x313, 0x342 }, { 0x3a5, 0x313, 0x342 } }; + special_casing_map[28] = { 0x1fb6, { 0x1fb6, 0x0 }, { 0x391, 0x342, 0x0 }, { 0x391, 0x342, 0x0 } }; + special_casing_map[29] = { 0x1fc6, { 0x1fc6, 0x0 }, { 0x397, 0x342, 0x0 }, { 0x397, 0x342, 0x0 } }; + special_casing_map[30] = { 0x1fd2, { 0x1fd2, 0x0 }, { 0x399, 0x308, 0x300 }, { 0x399, 0x308, 0x300 } }; + special_casing_map[31] = { 0x1fd3, { 0x1fd3, 0x0 }, { 0x399, 0x308, 0x301 }, { 0x399, 0x308, 0x301 } }; + special_casing_map[32] = { 0x1fd6, { 0x1fd6, 0x0 }, { 0x399, 0x342, 0x0 }, { 0x399, 0x342, 0x0 } }; + special_casing_map[33] = { 0x1fd7, { 0x1fd7, 0x0 }, { 0x399, 0x308, 0x342 }, { 0x399, 0x308, 0x342 } }; + special_casing_map[34] = { 0x1fe2, { 0x1fe2, 0x0 }, { 0x3a5, 0x308, 0x300 }, { 0x3a5, 0x308, 0x300 } }; + special_casing_map[35] = { 0x1fe3, { 0x1fe3, 0x0 }, { 0x3a5, 0x308, 0x301 }, { 0x3a5, 0x308, 0x301 } }; + special_casing_map[36] = { 0x1fe4, { 0x1fe4, 0x0 }, { 0x3a1, 0x313, 0x0 }, { 0x3a1, 0x313, 0x0 } }; + special_casing_map[37] = { 0x1fe6, { 0x1fe6, 0x0 }, { 0x3a5, 0x342, 0x0 }, { 0x3a5, 0x342, 0x0 } }; + special_casing_map[38] = { 0x1fe7, { 0x1fe7, 0x0 }, { 0x3a5, 0x308, 0x342 }, { 0x3a5, 0x308, 0x342 } }; + special_casing_map[39] = { 0x1ff6, { 0x1ff6, 0x0 }, { 0x3a9, 0x342, 0x0 }, { 0x3a9, 0x342, 0x0 } }; + special_casing_map[40] = { 0x1f80, { 0x1f80, 0x0 }, { 0x1f88, 0x0 }, { 0x1f08, 0x399, 0x0 } }; + special_casing_map[41] = { 0x1f81, { 0x1f81, 0x0 }, { 0x1f89, 0x0 }, { 0x1f09, 0x399, 0x0 } }; + special_casing_map[42] = { 0x1f82, { 0x1f82, 0x0 }, { 0x1f8a, 0x0 }, { 0x1f0a, 0x399, 0x0 } }; + special_casing_map[43] = { 0x1f83, { 0x1f83, 0x0 }, { 0x1f8b, 0x0 }, { 0x1f0b, 0x399, 0x0 } }; + special_casing_map[44] = { 0x1f84, { 
0x1f84, 0x0 }, { 0x1f8c, 0x0 }, { 0x1f0c, 0x399, 0x0 } }; + special_casing_map[45] = { 0x1f85, { 0x1f85, 0x0 }, { 0x1f8d, 0x0 }, { 0x1f0d, 0x399, 0x0 } }; + special_casing_map[46] = { 0x1f86, { 0x1f86, 0x0 }, { 0x1f8e, 0x0 }, { 0x1f0e, 0x399, 0x0 } }; + special_casing_map[47] = { 0x1f87, { 0x1f87, 0x0 }, { 0x1f8f, 0x0 }, { 0x1f0f, 0x399, 0x0 } }; + special_casing_map[48] = { 0x1f88, { 0x1f80, 0x0 }, { 0x1f88, 0x0 }, { 0x1f08, 0x399, 0x0 } }; + special_casing_map[49] = { 0x1f89, { 0x1f81, 0x0 }, { 0x1f89, 0x0 }, { 0x1f09, 0x399, 0x0 } }; + special_casing_map[50] = { 0x1f8a, { 0x1f82, 0x0 }, { 0x1f8a, 0x0 }, { 0x1f0a, 0x399, 0x0 } }; + special_casing_map[51] = { 0x1f8b, { 0x1f83, 0x0 }, { 0x1f8b, 0x0 }, { 0x1f0b, 0x399, 0x0 } }; + special_casing_map[52] = { 0x1f8c, { 0x1f84, 0x0 }, { 0x1f8c, 0x0 }, { 0x1f0c, 0x399, 0x0 } }; + special_casing_map[53] = { 0x1f8d, { 0x1f85, 0x0 }, { 0x1f8d, 0x0 }, { 0x1f0d, 0x399, 0x0 } }; + special_casing_map[54] = { 0x1f8e, { 0x1f86, 0x0 }, { 0x1f8e, 0x0 }, { 0x1f0e, 0x399, 0x0 } }; + special_casing_map[55] = { 0x1f8f, { 0x1f87, 0x0 }, { 0x1f8f, 0x0 }, { 0x1f0f, 0x399, 0x0 } }; + special_casing_map[56] = { 0x1f90, { 0x1f90, 0x0 }, { 0x1f98, 0x0 }, { 0x1f28, 0x399, 0x0 } }; + special_casing_map[57] = { 0x1f91, { 0x1f91, 0x0 }, { 0x1f99, 0x0 }, { 0x1f29, 0x399, 0x0 } }; + special_casing_map[58] = { 0x1f92, { 0x1f92, 0x0 }, { 0x1f9a, 0x0 }, { 0x1f2a, 0x399, 0x0 } }; + special_casing_map[59] = { 0x1f93, { 0x1f93, 0x0 }, { 0x1f9b, 0x0 }, { 0x1f2b, 0x399, 0x0 } }; + special_casing_map[60] = { 0x1f94, { 0x1f94, 0x0 }, { 0x1f9c, 0x0 }, { 0x1f2c, 0x399, 0x0 } }; + special_casing_map[61] = { 0x1f95, { 0x1f95, 0x0 }, { 0x1f9d, 0x0 }, { 0x1f2d, 0x399, 0x0 } }; + special_casing_map[62] = { 0x1f96, { 0x1f96, 0x0 }, { 0x1f9e, 0x0 }, { 0x1f2e, 0x399, 0x0 } }; + special_casing_map[63] = { 0x1f97, { 0x1f97, 0x0 }, { 0x1f9f, 0x0 }, { 0x1f2f, 0x399, 0x0 } }; + special_casing_map[64] = { 0x1f98, { 0x1f90, 0x0 }, { 0x1f98, 0x0 }, { 0x1f28, 0x399, 0x0 } }; + 
special_casing_map[65] = { 0x1f99, { 0x1f91, 0x0 }, { 0x1f99, 0x0 }, { 0x1f29, 0x399, 0x0 } }; + special_casing_map[66] = { 0x1f9a, { 0x1f92, 0x0 }, { 0x1f9a, 0x0 }, { 0x1f2a, 0x399, 0x0 } }; + special_casing_map[67] = { 0x1f9b, { 0x1f93, 0x0 }, { 0x1f9b, 0x0 }, { 0x1f2b, 0x399, 0x0 } }; + special_casing_map[68] = { 0x1f9c, { 0x1f94, 0x0 }, { 0x1f9c, 0x0 }, { 0x1f2c, 0x399, 0x0 } }; + special_casing_map[69] = { 0x1f9d, { 0x1f95, 0x0 }, { 0x1f9d, 0x0 }, { 0x1f2d, 0x399, 0x0 } }; + special_casing_map[70] = { 0x1f9e, { 0x1f96, 0x0 }, { 0x1f9e, 0x0 }, { 0x1f2e, 0x399, 0x0 } }; + special_casing_map[71] = { 0x1f9f, { 0x1f97, 0x0 }, { 0x1f9f, 0x0 }, { 0x1f2f, 0x399, 0x0 } }; + special_casing_map[72] = { 0x1fa0, { 0x1fa0, 0x0 }, { 0x1fa8, 0x0 }, { 0x1f68, 0x399, 0x0 } }; + special_casing_map[73] = { 0x1fa1, { 0x1fa1, 0x0 }, { 0x1fa9, 0x0 }, { 0x1f69, 0x399, 0x0 } }; + special_casing_map[74] = { 0x1fa2, { 0x1fa2, 0x0 }, { 0x1faa, 0x0 }, { 0x1f6a, 0x399, 0x0 } }; + special_casing_map[75] = { 0x1fa3, { 0x1fa3, 0x0 }, { 0x1fab, 0x0 }, { 0x1f6b, 0x399, 0x0 } }; + special_casing_map[76] = { 0x1fa4, { 0x1fa4, 0x0 }, { 0x1fac, 0x0 }, { 0x1f6c, 0x399, 0x0 } }; + special_casing_map[77] = { 0x1fa5, { 0x1fa5, 0x0 }, { 0x1fad, 0x0 }, { 0x1f6d, 0x399, 0x0 } }; + special_casing_map[78] = { 0x1fa6, { 0x1fa6, 0x0 }, { 0x1fae, 0x0 }, { 0x1f6e, 0x399, 0x0 } }; + special_casing_map[79] = { 0x1fa7, { 0x1fa7, 0x0 }, { 0x1faf, 0x0 }, { 0x1f6f, 0x399, 0x0 } }; + special_casing_map[80] = { 0x1fa8, { 0x1fa0, 0x0 }, { 0x1fa8, 0x0 }, { 0x1f68, 0x399, 0x0 } }; + special_casing_map[81] = { 0x1fa9, { 0x1fa1, 0x0 }, { 0x1fa9, 0x0 }, { 0x1f69, 0x399, 0x0 } }; + special_casing_map[82] = { 0x1faa, { 0x1fa2, 0x0 }, { 0x1faa, 0x0 }, { 0x1f6a, 0x399, 0x0 } }; + special_casing_map[83] = { 0x1fab, { 0x1fa3, 0x0 }, { 0x1fab, 0x0 }, { 0x1f6b, 0x399, 0x0 } }; + special_casing_map[84] = { 0x1fac, { 0x1fa4, 0x0 }, { 0x1fac, 0x0 }, { 0x1f6c, 0x399, 0x0 } }; + special_casing_map[85] = { 0x1fad, { 0x1fa5, 0x0 }, { 
0x1fad, 0x0 }, { 0x1f6d, 0x399, 0x0 } }; + special_casing_map[86] = { 0x1fae, { 0x1fa6, 0x0 }, { 0x1fae, 0x0 }, { 0x1f6e, 0x399, 0x0 } }; + special_casing_map[87] = { 0x1faf, { 0x1fa7, 0x0 }, { 0x1faf, 0x0 }, { 0x1f6f, 0x399, 0x0 } }; + special_casing_map[88] = { 0x1fb3, { 0x1fb3, 0x0 }, { 0x1fbc, 0x0 }, { 0x391, 0x399, 0x0 } }; + special_casing_map[89] = { 0x1fbc, { 0x1fb3, 0x0 }, { 0x1fbc, 0x0 }, { 0x391, 0x399, 0x0 } }; + special_casing_map[90] = { 0x1fc3, { 0x1fc3, 0x0 }, { 0x1fcc, 0x0 }, { 0x397, 0x399, 0x0 } }; + special_casing_map[91] = { 0x1fcc, { 0x1fc3, 0x0 }, { 0x1fcc, 0x0 }, { 0x397, 0x399, 0x0 } }; + special_casing_map[92] = { 0x1ff3, { 0x1ff3, 0x0 }, { 0x1ffc, 0x0 }, { 0x3a9, 0x399, 0x0 } }; + special_casing_map[93] = { 0x1ffc, { 0x1ff3, 0x0 }, { 0x1ffc, 0x0 }, { 0x3a9, 0x399, 0x0 } }; + special_casing_map[94] = { 0x1fb2, { 0x1fb2, 0x0 }, { 0x1fba, 0x345, 0x0 }, { 0x1fba, 0x399, 0x0 } }; + special_casing_map[95] = { 0x1fb4, { 0x1fb4, 0x0 }, { 0x386, 0x345, 0x0 }, { 0x386, 0x399, 0x0 } }; + special_casing_map[96] = { 0x1fc2, { 0x1fc2, 0x0 }, { 0x1fca, 0x345, 0x0 }, { 0x1fca, 0x399, 0x0 } }; + special_casing_map[97] = { 0x1fc4, { 0x1fc4, 0x0 }, { 0x389, 0x345, 0x0 }, { 0x389, 0x399, 0x0 } }; + special_casing_map[98] = { 0x1ff2, { 0x1ff2, 0x0 }, { 0x1ffa, 0x345, 0x0 }, { 0x1ffa, 0x399, 0x0 } }; + special_casing_map[99] = { 0x1ff4, { 0x1ff4, 0x0 }, { 0x38f, 0x345, 0x0 }, { 0x38f, 0x399, 0x0 } }; + special_casing_map[100] = { 0x1fb7, { 0x1fb7, 0x0 }, { 0x391, 0x342, 0x345 }, { 0x391, 0x342, 0x399 } }; + special_casing_map[101] = { 0x1fc7, { 0x1fc7, 0x0 }, { 0x397, 0x342, 0x345 }, { 0x397, 0x342, 0x399 } }; + special_casing_map[102] = { 0x1ff7, { 0x1ff7, 0x0 }, { 0x3a9, 0x342, 0x345 }, { 0x3a9, 0x342, 0x399 } }; +} + } diff --git a/src/encoding_object.cpp b/src/encoding_object.cpp index f67a0e2ab..0d0084a8a 100644 --- a/src/encoding_object.cpp +++ b/src/encoding_object.cpp @@ -1,3 +1,4 @@ +#include "natalie/encoding_object.hpp" #include "natalie.hpp" 
#include #include @@ -243,45 +244,114 @@ void EncodingObject::initialize_defaults(Env *env) { s_filesystem = s_default_external; } -nat_int_t EncodingObject::codepoint_to_lowercase(nat_int_t codepoint, bool ascii_only) { +uint8_t EncodingObject::codepoint_to_lowercase(nat_int_t codepoint, nat_int_t result[], bool ascii_only) { if (ascii_only) { if (codepoint >= 'A' && codepoint <= 'Z') - return codepoint + 32; - return codepoint; + result[0] = codepoint + 32; + else + result[0] = codepoint; + return 1; } auto block = codepoint >> 8; auto index = lcase_index[block] + (codepoint & 0xff); auto delta = lcase_map[index]; - if (delta == 0) - return codepoint; - return codepoint + delta; + if (delta != 0) { + result[0] = codepoint + delta; + return 1; + } + + if (special_casing_map[0].code == 0) + init_special_casing_map(); + auto entry = find_special_casing_map_entry(codepoint); + if (entry.code != 0) { + int i = 0; + for (i = 0; i < SPECIAL_CASE_LOWER_MAX_SIZE; i++) { + if (entry.lower[i] == 0) break; + result[i] = entry.lower[i]; + } + return i; + } + + result[0] = codepoint; + return 1; } -nat_int_t EncodingObject::codepoint_to_uppercase(nat_int_t codepoint, bool ascii_only) { +uint8_t EncodingObject::codepoint_to_uppercase(nat_int_t codepoint, nat_int_t result[], bool ascii_only) { if (ascii_only) { if (codepoint >= 'a' && codepoint <= 'z') - return codepoint - 32; - return codepoint; + result[0] = codepoint - 32; + else + result[0] = codepoint; + return 1; } auto block = codepoint >> 8; auto index = ucase_index[block] + (codepoint & 0xff); auto delta = ucase_map[index]; - if (delta == 0) - return codepoint; - return codepoint + delta; + if (delta != 0) { + result[0] = codepoint + delta; + return 1; + } + + if (special_casing_map[0].code == 0) + init_special_casing_map(); + auto entry = find_special_casing_map_entry(codepoint); + if (entry.code != 0) { + int i = 0; + for (i = 0; i < SPECIAL_CASE_UPPER_MAX_SIZE; i++) { + if (entry.upper[i] == 0) break; + result[i] = 
entry.upper[i]; + } + return i; + } + + result[0] = codepoint; + return 1; } -nat_int_t EncodingObject::codepoint_to_titlecase(nat_int_t codepoint, bool ascii_only) { - if (ascii_only) return codepoint_to_uppercase(codepoint, true); +uint8_t EncodingObject::codepoint_to_titlecase(nat_int_t codepoint, nat_int_t result[], bool ascii_only) { + if (ascii_only) return codepoint_to_uppercase(codepoint, result, true); auto block = codepoint >> 8; auto index = tcase_index[block] + (codepoint & 0xff); auto delta = tcase_map[index]; - if (delta == 0) - return codepoint; - return codepoint + delta; + if (delta != 0) { + result[0] = codepoint + delta; + return 1; + } + + if (special_casing_map[0].code == 0) + init_special_casing_map(); + auto entry = find_special_casing_map_entry(codepoint); + if (entry.code != 0) { + int i = 0; + for (i = 0; i < SPECIAL_CASE_TITLE_MAX_SIZE; i++) { + if (entry.title[i] == 0) break; + result[i] = entry.title[i]; + } + return i; + } + + result[0] = codepoint; + return 1; +} + +// Finds the SpecialCasing.txt entry for `codepoint`. +// Returns a zero-initialized entry (code == 0) when there is no special casing. +// +// The generated table keeps SpecialCasing.txt order, which is not sorted by code +// point (e.g. 0xfb06 is followed by 0x587), so the previous binary search missed +// entries such as 0x587. The table is small (special_casing_map_size entries), +// so scan it linearly instead. +SpecialCasingEntry EncodingObject::find_special_casing_map_entry(nat_int_t codepoint) { + for (int i = 0; i < special_casing_map_size; i++) { + const auto &entry = special_casing_map[i]; + if (entry.code == codepoint) + return entry; + } + + return {}; } bool EncodingObject::is_printable_char(const nat_int_t c) const { diff --git a/src/string_object.cpp b/src/string_object.cpp index a93fa5c01..e0d0deae8 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -2820,13 +2820,16 @@ StringObject *StringObject::capitalize(Env *env, Value arg1, Value arg2) { auto str = new StringObject { "", m_encoding }; bool first_char = true; auto ascii_only = flags & Ascii; + nat_int_t result[3] = {}; + uint8_t length = 0; for (StringView c : *this) { nat_int_t codepoint = m_encoding->decode_codepoint(c); if (first_char) - codepoint = 
EncodingObject::codepoint_to_titlecase(codepoint, ascii_only); + length = EncodingObject::codepoint_to_titlecase(codepoint, result, ascii_only); else - codepoint = EncodingObject::codepoint_to_lowercase(codepoint, ascii_only); - str->append(m_encoding->encode_codepoint(codepoint)); + length = EncodingObject::codepoint_to_lowercase(codepoint, result, ascii_only); + for (uint8_t i = 0; i < length; i++) + str->append(m_encoding->encode_codepoint(result[i])); first_char = false; } return str; @@ -2845,11 +2848,12 @@ Value StringObject::capitalize_in_place(Env *env, Value arg1, Value arg2) { StringObject *StringObject::downcase(Env *env, Value arg1, Value arg2) { auto flags = check_case_options(env, arg1, arg2, Downcase); auto str = new StringObject { "", m_encoding }; + nat_int_t result[3] = {}; for (StringView c : *this) { auto codepoint = m_encoding->decode_codepoint(c); if (flags & Ascii) { - codepoint = EncodingObject::codepoint_to_lowercase(codepoint, true); - str->append(m_encoding->encode_codepoint(codepoint)); + EncodingObject::codepoint_to_lowercase(codepoint, result, true); + str->append(m_encoding->encode_codepoint(result[0])); } else if ((flags & Fold || flags & FoldLithuanian) && !(flags & FoldTurkicAzeri)) { auto result = EncodingObject::casefold_full(codepoint); if (result->is_array()) { @@ -2862,8 +2866,9 @@ StringObject *StringObject::downcase(Env *env, Value arg1, Value arg2) { str->append(m_encoding->encode_codepoint(codepoint)); } } else { - codepoint = EncodingObject::codepoint_to_lowercase(codepoint); - str->append(m_encoding->encode_codepoint(codepoint)); + auto length = EncodingObject::codepoint_to_lowercase(codepoint, result); + for (uint8_t i = 0; i < length; i++) + str->append(m_encoding->encode_codepoint(result[i])); } } return str; @@ -2893,10 +2898,12 @@ StringObject *StringObject::upcase(Env *env, Value arg1, Value arg2) { auto flags = check_case_options(env, arg1, arg2, Upcase); auto str = new StringObject { "", m_encoding }; auto 
ascii_only = flags & Ascii; + nat_int_t result[3] = {}; for (StringView c : *this) { auto codepoint = m_encoding->decode_codepoint(c); - codepoint = EncodingObject::codepoint_to_uppercase(codepoint, ascii_only); - str->append(m_encoding->encode_codepoint(codepoint)); + auto length = EncodingObject::codepoint_to_uppercase(codepoint, result, ascii_only); + for (uint8_t i = 0; i < length; i++) + str->append(m_encoding->encode_codepoint(result[i])); } return str; }