-
Notifications
You must be signed in to change notification settings - Fork 8
/
codecs.dart
214 lines (190 loc) · 7.85 KB
/
codecs.dart
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/** Decodes bytes using the correct name. See [decodeBytes]. */
#library('codecs');
#import('dart:utf');
#import('dart:io'); // for DecoderException
/**
 * Returns true if the three bytes starting at [offset] are the UTF-8 byte
 * order mark (0xEF 0xBB 0xBF).
 *
 * Only the [length] bytes starting at [offset] are considered; when [length]
 * is omitted the range extends to the end of [bytes]. A range shorter than
 * three bytes never matches.
 */
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  final int limit = (length == null) ? bytes.length : offset + length;
  // Too short to hold the three byte mark.
  if (limit - offset < 3) return false;
  if (bytes[offset] != 0xEF) return false;
  if (bytes[offset + 1] != 0xBB) return false;
  return bytes[offset + 2] == 0xBF;
}
// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
// file, but dart:utf does not expose stream-based decoders yet.
/**
 * Decodes the [bytes] with the provided [encoding] and returns an iterable of
 * the codepoints. Supports the major unicode encodings as well as the ascii
 * and windows-1252 encodings.
 *
 * Decoding starts at [offset] and covers [length] bytes; when [length] is
 * omitted, everything from [offset] to the end of [bytes] is decoded.
 * [replacementCodepoint] is passed through to the underlying decoder; the
 * 'ascii' encoding does not use it and throws a [DecoderException] on any
 * byte above 127. Throws an [IllegalArgumentException] if [encoding] is not
 * one of the supported names.
 */
Iterable<int> decodeBytes(String encoding, List<int> bytes,
    [int offset = 0, int length,
     int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // [length] is a byte count measured from [offset], so the default has to be
  // what remains after [offset]. Defaulting to bytes.length would run past
  // the end of the list whenever [offset] is non-zero.
  int count = length != null ? length : bytes.length - offset;
  final replace = replacementCodepoint;
  switch (encoding) {
    case 'ascii':
      bytes = bytes.getRange(offset, count);
      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
      for (int byte in bytes) {
        if (byte > 127) {
          throw new DecoderException("Illegal ASCII character $byte");
        }
      }
      return bytes;
    case 'windows-1252':
    case 'cp1252':
      // The caller's [length] is handed over untouched (possibly null) so
      // that the windows-1252 decoder applies its own "to the end of the
      // list" default.
      return decodeWindows1252AsIterable(bytes, offset, length, replace);
    case 'utf-8':
      // NOTE: to match the behavior of the other decode functions, we eat the
      // utf-8 BOM here.
      if (hasUtf8Bom(bytes, offset, count)) {
        offset += 3;
        count -= 3;
      }
      return decodeUtf8AsIterable(bytes, offset, count, replace);
    case 'utf-16':
      return decodeUtf16AsIterable(bytes, offset, count, replace);
    case 'utf-16-be':
      return decodeUtf16beAsIterable(bytes, offset, count, true, replace);
    case 'utf-16-le':
      return decodeUtf16leAsIterable(bytes, offset, count, true, replace);
    case 'utf-32':
      return decodeUtf32AsIterable(bytes, offset, count, replace);
    case 'utf-32-be':
      return decodeUtf32beAsIterable(bytes, offset, count, true, replace);
    case 'utf-32-le':
      return decodeUtf32leAsIterable(bytes, offset, count, true, replace);
    default:
      throw new IllegalArgumentException('Encoding $encoding not supported');
  }
}
/**
 * Given a UCS-2 string which may contain UTF-16 surrogate pairs, returns a
 * string in which every lead/trail surrogate pair has been combined into the
 * single code point it encodes. If [input] contains no surrogate pairs, that
 * same string instance is returned unmodified. A surrogate that is not part
 * of a complete pair is kept as it is.
 *
 * This is useful for fixing strings returned by [JSON.parse], if the JSON
 * has UTF-16 encoded via surrogate pairs of characters. For example,
 * `"\ud835\udd04"` should translate to a one character string with the code
 * point `0x01d504`.
 */
String decodeUtf16Surrogates(String input) {
  // Stays null until the first pair is found, so input without any pairs
  // costs no allocation.
  List<int> codes = null;
  final int count = input.length;
  int index = 0;
  while (index < count) {
    int unit = input.charCodeAt(index);
    int following = index + 1;
    bool isLead = unit >= 0xD800 && unit <= 0xDBFF;
    if (isLead && following < count) {
      int trail = input.charCodeAt(following);
      if (trail >= 0xDC00 && trail <= 0xDFFF) {
        if (codes == null) {
          // First pair seen: copy over everything that came before it.
          codes = <int>[];
          for (int k = 0; k < index; k++) codes.add(input.charCodeAt(k));
        }
        unit = 0x10000 + ((unit - 0xD800) << 10) + (trail - 0xDC00);
        // The trail surrogate has been consumed as well.
        following++;
      }
    }
    if (codes != null) codes.add(unit);
    index = following;
  }
  return codes == null ? input : codepointsToString(codes);
}
/**
 * Wraps [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) encoded
 * [bytes] in an iterable of codepoints. Bytes are decoded as the iterable is
 * consumed, so the consumer can convert only as much of the input as needed.
 * Set [replacementCodepoint] to null to throw an [IllegalArgumentException]
 * rather than replace the bad value.
 */
IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
    [int offset = 0, int length,
     int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableWindows1252Decoder(bytes, offset, length, replacementCodepoint);
/**
 * Return type of [decodeWindows1252AsIterable] and variants. It only records
 * the decoding arguments; each call to [iterator] builds a new
 * [Windows1252Decoder] from them, and that iterator translates bytes only as
 * its user asks for them. (Note: results are not cached.)
 */
class IterableWindows1252Decoder implements Iterable<int> {
  final List<int> bytes;
  final int offset;
  final int length;
  final int replacementCodepoint;

  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  /** Returns a new decoder over the stored arguments. */
  Windows1252Decoder iterator() {
    return new Windows1252Decoder(bytes, offset, length, replacementCodepoint);
  }
}
/**
 * An iterator of Unicode codepoints decoded from windows-1252 encoded bytes.
 *
 * Decoding starts at [offset] and covers [length] bytes; when [length] is
 * omitted it runs to the end of the list. Bytes in the range 0x80 to 0x9F are
 * mapped to their windows-1252 codepoints and every other byte is returned
 * unchanged. The bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D are invalid in
 * windows-1252 and yield [replacementCodepoint]; set it to null to throw an
 * [IllegalArgumentException] instead of replacing the bad value.
 */
class Windows1252Decoder implements Iterator<int> {
  final int replacementCodepoint;
  final List<int> _bytes;
  /** Index of the next byte to decode. */
  int _offset;
  /** Index one past the last byte to decode. */
  final int _end;

  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : _bytes = bytes,
        _offset = offset,
        // [length] counts bytes starting at [offset], so the stop index has to
        // be measured from [offset] rather than from the start of the list.
        _end = length == null ? bytes.length : offset + length;

  bool hasNext() => _offset < _end;

  /**
   * Returns the codepoint for the next byte and advances past it. Throws a
   * [NoMoreElementsException] once the range is exhausted.
   */
  int next() {
    if (!hasNext()) throw const NoMoreElementsException();
    return _mapChar(_bytes[_offset++]);
  }

  /** Translates a single windows-1252 byte to its Unicode codepoint. */
  int _mapChar(int char) {
    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
    // replacementCharacters from constants.dart
    switch (char) {
      case 0x80: return 0x20AC; // EURO SIGN
      case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK
      case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK
      case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
      case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS
      case 0x86: return 0x2020; // DAGGER
      case 0x87: return 0x2021; // DOUBLE DAGGER
      case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
      case 0x89: return 0x2030; // PER MILLE SIGN
      case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
      case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE
      case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
      case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK
      case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK
      case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK
      case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK
      case 0x95: return 0x2022; // BULLET
      case 0x96: return 0x2013; // EN DASH
      case 0x97: return 0x2014; // EM DASH
      case 0x98: return 0x02DC; // SMALL TILDE
      case 0x99: return 0x2122; // TRADE MARK SIGN
      case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON
      case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE
      case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON
      case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

      case 0x81:
      case 0x8D:
      case 0x8F:
      case 0x90:
      case 0x9D:
        if (replacementCodepoint == null) {
          // [next] has already moved [_offset] past the offending byte, so
          // step back one to report the position of the byte itself.
          throw new IllegalArgumentException(
              "Invalid windows-1252 code point $char at ${_offset - 1}");
        }
        return replacementCodepoint;
    }
    return char;
  }
}