forked from alexsilva/luabit-legacy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utf8.lua
163 lines (139 loc) · 3.98 KB
/
utf8.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
--[[---------------
Utf8 v0.4
-------------------
UTF-8 -> Unicode UCS-2 converter (1- to 3-byte UTF-8 sequences; each character is written as 2 bytes, low byte first)
How to use:
to convert:
ucs2_string = utf8.utf_to_uni(utf8_string)
to view a string in hex:
utf8.print_hex(str)
Under the MIT license.
Utf8 is a part of LuaBit Project(http://luaforge.net/projects/bit/).
copyright(c) 2007 hanzhao ([email protected])
--]]---------------
require 'hex'
require 'bit'
do
-- Lead-byte signatures: the value a lead byte has once it is ANDed with the
-- matching BYTE_n_MASK below. (hex.to_dec is assumed to turn the hex literal
-- into its numeric value.)
local BYTE_1_HEAD = hex.to_dec('0x00') -- 0xxx xxxx : 1-byte sequence
local BYTE_2_HEAD = hex.to_dec('0xC0') -- 110x xxxx : 2-byte sequence
local BYTE_3_HEAD = hex.to_dec('0xE0') -- 1110 xxxx : 3-byte sequence

-- Masks isolating the signature bits of a lead byte.
local BYTE_1_MASK = hex.to_dec('0x80') -- keeps the top bit
local BYTE_2_MASK = hex.to_dec('0xE0') -- keeps the top 3 bits
local BYTE_3_MASK = hex.to_dec('0xF0') -- keeps the top 4 bits

-- Mask keeping the 6 payload bits of a trailing byte (10xx xxxx).
local TAIL_MASK = hex.to_dec('0x3F')

-- Parallel arrays: mask_tbl[k] is paired with head_tbl[k]. They are listed
-- from the longest form to the shortest.
local mask_tbl = {}
local head_tbl = {}
mask_tbl[1], head_tbl[1] = BYTE_3_MASK, BYTE_3_HEAD
mask_tbl[2], head_tbl[2] = BYTE_2_MASK, BYTE_2_HEAD
mask_tbl[3], head_tbl[3] = BYTE_1_MASK, BYTE_1_HEAD

-- Total sequence length in bytes, keyed by lead-byte signature.
local len_tbl = {}
len_tbl[BYTE_1_HEAD] = 1
len_tbl[BYTE_2_HEAD] = 2
len_tbl[BYTE_3_HEAD] = 3
--- Decode the UTF-8 sequence that begins at byte index `start` of `utf`.
-- Only the 1-, 2- and 3-byte forms listed in mask_tbl/head_tbl are recognised.
-- Trailing bytes are masked with TAIL_MASK but their 10xx xxxx marker is not
-- validated.
-- @param utf   string holding UTF-8 encoded text
-- @param start 1-based index of the lead byte to decode
-- @return the decoded code point (number) and the sequence length in bytes
-- Raises an error when `start` lies outside the string, when the lead byte
-- matches none of the known signatures, or when the string ends before the
-- sequence is complete.
local function utf_read_char(utf, start)
  local head_byte = string.byte(utf, start)
  if head_byte == nil then
    -- fail here with a clear message instead of passing nil into bit.band
    error('utf_read_char: no byte at index ' .. tostring(start))
  end

  -- ipairs is used instead of the deprecated table.getn
  for m, mask in ipairs(mask_tbl) do
    if bit.band(head_byte, mask) == head_tbl[m] then
      local len = len_tbl[head_tbl[m]]
      -- payload bits of the lead byte: everything outside the signature mask
      local char = bit.band(head_byte, bit.bnot(mask))
      -- append 6 payload bits per trailing byte, most significant first
      for i = start + 1, start + len - 1 do
        local tail_byte = string.byte(utf, i)
        if tail_byte == nil then
          error('utf_read_char: truncated ' .. len ..
                '-byte sequence starting at index ' .. tostring(start))
        end
        char = bit.bor(bit.blshift(char, 6), bit.band(tail_byte, TAIL_MASK))
      end
      return char, len
    end
  end
  error('not find proper head mask')
end
--- Print all bytes of `str` on a single line.
-- Each byte is rendered by hex.to_hex and preceded by one space.
-- @param str string whose bytes are dumped
local function print_hex(str)
  local pieces = {}
  for pos = 1, string.len(str) do
    pieces[pos] = ' ' .. hex.to_hex(string.byte(str, pos))
  end
  print(table.concat(pieces))
end
-- Mask selecting one byte (the low 8 bits) of a value.
local LO_MASK = hex.to_dec('0xFF')

--- Serialise a code point as two bytes, low byte first.
-- string.char is used rather than string.format('%c', ...): on Lua 5.1
-- string.format('%c', 0) produces an empty string, so a format-based version
-- emits one byte instead of two for code point 0.
-- @param char code point (number); only its low 16 bits are written
-- @return a 2-byte string: low byte followed by high byte
local function char_to_str(char)
  local lo = bit.band(char, LO_MASK)
  -- masked so the value always fits string.char's 0..255 range
  local hi = bit.band(bit.brshift(char, 8), LO_MASK)
  return string.char(lo, hi)
end
--- Convert a UTF-8 string into its 2-bytes-per-character form.
-- Walks the input one sequence at a time with utf_read_char and appends the
-- bytes produced by char_to_str for every decoded code point.
-- @param utf string holding UTF-8 encoded text
-- @return the converted string ('' for empty input)
-- Errors raised by utf_read_char on malformed input propagate to the caller.
local function utf_to_uni(utf)
  local n = string.len(utf)
  local i = 1
  -- collect the pieces and join once; avoids quadratic string concatenation
  local parts = {}
  local count = 0
  while i <= n do
    -- declared local so the loop no longer leaks globals `char` and `len`
    local char, len = utf_read_char(utf, i)
    i = i + len
    count = count + 1
    parts[count] = char_to_str(char)
  end
  return table.concat(parts)
end
-- Public interface: published as the global table `utf8`.
-- NOTE(review): on Lua 5.3+ this assignment replaces the standard `utf8`
-- library table -- confirm the intended Lua version.
local exported = {}
exported.utf_to_uni = utf_to_uni
exported.print_hex = print_hex
utf8 = exported
end
--[[
-- test
byte_3 = string.format('%c%c%c', hex.to_dec('0xE7'), hex.to_dec('0x83'), hex.to_dec('0xad'))
print(string.len(byte_3))
utf8.utf_to_uni(byte_3)
--]]
--[[
byte_2 = string.format('%c%c', hex.to_dec('0xC2'), hex.to_dec('0x9D'))
utf8.utf_to_uni(byte_2)
byte_1 = string.format('%c', hex.to_dec('0xB'))
utf8.utf_to_uni(byte_1)
--]]
--[[
test_mul = string.format(
'%c%c%c%c%c%c%c%c%c',
hex.to_dec('0xE8'),hex.to_dec('0xAF'), hex.to_dec('0xBA'),
hex.to_dec('0xE5'),hex.to_dec('0x9F'), hex.to_dec('0xBA'),
hex.to_dec('0xE4'),hex.to_dec('0xBA'), hex.to_dec('0x9A'))
utf8.print_hex(utf8.utf_to_uni(test_mul))
--]]