-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.cpp
129 lines (110 loc) · 2.86 KB
/
tokenizer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#include "tokenizer.hpp"
#include <string.h>
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
const wchar_t *UTF8Tokenizer::flush() {
if (index == 0) return NULL;
memset(result, 0, sizeof(result));
int i = 0;
for (; i < index; ++i) result[i] = L'?';
state = index = 0;
return result;
}
// Returns the number of continuation bytes announced by a UTF-8 lead byte.
//
// Counts the run of 1-bits that follows the two leading bits of `lead`:
// 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3, 0xF8-0xFB -> 4,
// 0xFC-0xFD -> 5, 0xFE -> 6, 0xFF -> 7.
static int utf8ContinuationCount(unsigned char lead) {
    // The shifts are done on an unsigned type on purpose: left-shifting a
    // negative signed char is undefined behaviour before C++20.
    unsigned char bits = static_cast<unsigned char>(lead << 2);
    int count = 1;
    while (bits & 0x80) {
        if (count == 6) {
            return 7;
        }
        bits = static_cast<unsigned char>(bits << 1);
        ++count;
    }
    return count;
}

// Feeds one byte of a UTF-8 stream into the tokenizer.
//
// Returns NULL while a multi-byte character is still incomplete. Otherwise
// returns the internal result buffer, which holds one of:
//   - the ASCII character itself,
//   - the wide-character conversion of a completed multi-byte sequence,
//   - L'?' for a continuation byte that arrived without a lead byte,
//   - one L'?' per buffered byte when a sequence is cut short by a
//     non-continuation byte. That byte is then processed as the first byte
//     of the next character instead of being discarded, so an ASCII
//     character is appended after the marks and a lead byte starts a new
//     sequence.
const wchar_t *UTF8Tokenizer::add(unsigned char chr) {
    const bool isContinuation = (chr & 0xC0) == 0x80;
    int replaced = 0;  // Number of L'?' marks already written to result.

    if (state != 0) {  // Reading a later byte of a character.
        if (isContinuation) {
            token[index++] = chr;
            --state;
            if (state != 0) {
                return NULL;
            }
            // Completed one character. The -1 length relies on token having
            // been zero-filled when the sequence started.
            memset(result, 0, sizeof(result));
            MultiByteToWideChar(CP_UTF8, 0, (char *)token, -1, result, sizeof(result) / sizeof(result[0]));
            index = 0;
            return result;
        }

        // Got a first byte, which is unexpected here: replace the bytes of
        // the unfinished sequence, then fall through so chr is not lost.
        memset(result, 0, sizeof(result));
        for (; replaced < index; ++replaced) {
            result[replaced] = L'?';
        }
        state = index = 0;
    }

    // Reading the first byte of a character.
    memset(token, 0, sizeof(token));

    // Got a later byte, which is unexpected here. Only reachable when no
    // sequence was pending, so replaced is 0.
    if (isContinuation) {
        memset(result, 0, sizeof(result));
        result[0] = L'?';
        return result;
    }

    // ASCII
    if (!(chr & 0x80)) {
        if (replaced == 0) {
            memset(result, 0, sizeof(result));
            result[0] = chr;
            return result;
        }
        // Append after the replacement marks, but only when the buffer has
        // room for the character plus the terminating zero.
        const int capacity = static_cast<int>(sizeof(result) / sizeof(result[0]));
        if (replaced + 1 < capacity) {
            result[replaced] = chr;
        }
        return result;
    }

    // Lead byte of a multi-byte character: remember it and how many
    // continuation bytes must follow.
    token[0] = chr;
    state = utf8ContinuationCount(chr);
    ++index;

    // If a previous sequence was aborted, its replacement marks are reported
    // now; otherwise there is no output until the character is complete.
    return replaced != 0 ? result : NULL;
}
// Abandons a pending Shift_JIS lead byte.
//
// Returns NULL when no lead byte is buffered. Otherwise clears the buffered
// byte and returns the internal result buffer holding a single L'?'.
const wchar_t *SJISTokenizer::flush() {
    const bool hasPendingLead = (token != 0);
    if (!hasPendingLead) {
        return NULL;
    }

    // Drop the dangling lead byte and report it as one replacement mark.
    token = 0;
    memset(result, 0, sizeof(result));
    result[0] = L'?';
    return result;
}
// Feeds one byte of a Shift_JIS (code page 932) stream into the tokenizer.
//
// Returns NULL when chr was stored as the lead byte of a double-byte
// character. Otherwise returns the internal result buffer, which holds one
// of:
//   - the ASCII character itself,
//   - the conversion of a single-byte kana (0xA1-0xDF),
//   - the conversion of a completed double-byte character,
//   - L"?" for an invalid first byte,
//   - L"??" when the second byte of a double-byte character is invalid.
const wchar_t *SJISTokenizer::add(unsigned char chr) {
    const int capacity = static_cast<int>(sizeof(result) / sizeof(result[0]));

    // Reading the first byte of a character.
    if (token == 0) {
        // Invalid
        if (chr == 0x80 || chr == 0xA0 || chr >= 0xFD) {
            memset(result, 0, sizeof(result));
            result[0] = L'?';
            return result;
        }
        // ASCII
        if (!(chr & 0x80)) {
            memset(result, 0, sizeof(result));
            result[0] = chr;
            return result;
        }
        // Kana: a complete single-byte character. Convert the byte just
        // received; token is always 0 in this branch, so converting it
        // would yield an empty string.
        if (chr >= 0xA1 && chr <= 0xDF) {
            memset(result, 0, sizeof(result));
            MultiByteToWideChar(932, 0, reinterpret_cast<const char *>(&chr), 1, result, capacity);
            return result;
        }
        // Lead byte of a double-byte character: wait for the second byte.
        token = chr;
        return NULL;
    }

    // Reading the second byte of a character.
    if (chr <= 0x3f || chr == 0x7F || chr >= 0xFD) {  // Invalid
        // Both the stored lead byte and this byte are replaced.
        memset(result, 0, sizeof(result));
        result[0] = result[1] = L'?';
        token = 0;
        return result;
    }

    // Completed one character.
    const unsigned char character[2] = {token, chr};
    memset(result, 0, sizeof(result));
    MultiByteToWideChar(932, 0, reinterpret_cast<const char *>(character), 2, result, capacity);
    token = 0;
    return result;
}