#!/usr/bin/python -OOS
# coding=utf-8
from sys import exit
from sys import stderr
import argparse
parser = argparse.ArgumentParser(
    description='Generate code book for Chinese input method')
parser.add_argument('-a', '--alternative_map',
    help='alternatively map groups to keys based on shape resemblance',
    action='store_true')
parser.add_argument('-b', '--breakdown_file',
    help='input file breaking characters down to roots',
    default='breakdown.txt')
parser.add_argument('-g', '--group_file',
    help='input file grouping roots',
    default='group.txt')
parser.add_argument('-c', '--code_file',
    help='output codes to this file',
    default='code.txt')
parser.add_argument('-f', '--freq_file',
    help='output root frequencies to this file',
    default='freq.txt')
parser.add_argument('-m', '--max_code_length',
    help='maximum code length',
    type=int, default=4)
parser.add_argument('-p', '--pad_with_pin_yin',
    help='add Pin-Yin for code length below the maximum',
    type=int, default=2)
parser.add_argument('-s', '--simple_code_length',
    help='generate simple codes up to this length',
    type=int, default=3)
parser.add_argument('-o', '--optimize',
    help='optimize for root groups, '
         'taking "all" or a comma-separated list of roots')
parser.add_argument('-n', '--new_group_file',
    help='output optimized groups to this file',
    default='group')
args = parser.parse_args()
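# Typical invocations (file names are the argparse defaults above; a
# character-frequency table named char_freq.txt is also read further below):
#   ./gene.py
#   ./gene.py --optimize all --new_group_file group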
with open(args.breakdown_file) as f:
    lines = f.readlines()
for i in range(len(lines)):
    if lines[i].strip() == '#START':
        break
else:
    print 'The character table must start with a "#START" line'
    exit(1)
#an B0B6 岸 山(厂干ZS)SX
#ai DEDF 捱 扌[涯右]
#biao ECAE 飚 风(火(火火)SX)ZX
#sui CBE8 髓 骨(辶(左月SX)ZX)
def parse_breakdown(line, i):
    breakdown = []
    while i < len(line):
        if ord(line[i]) >= 240:  # 4-byte character
            breakdown += [line[i:i+4]]
            i += 4
        elif ord(line[i]) >= 224:  # 3-byte character
            breakdown += [line[i:i+3]]
            i += 3
        elif line[i] == '[':  # non-character
            j = i + 1
            while line[j] != ']':
                j += 1
                if j >= len(line):
                    return '', i, 'mismatched ['
            breakdown += [line[i:j+1]]
            i = j + 1
        elif line[i] == '(':  # structure
            part, i, err = parse_breakdown(line, i+1)
            if err != '':
                return part, i, err
            assert line[i] == ')', 'mismatched ('
            i += 1
            breakdown += [part]
        elif line[i] == ')':  # structure
            return breakdown, i, ''
        elif line[i] == '?':  # unknown
            return '', i, 'unknown breakdown'
        elif line[i].isalpha() or line[i] in ['{', '}']:
            i += 1
        else:
            return '', i, 'invalid character %s' % line[i]
    return breakdown, i, ''
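# For example, for the breakdown 山(厂干ZS)SX of 岸 above, parse_breakdown
# returns ['山', ['厂', '干']] together with the final byte index and an empty
# error string: multi-byte UTF-8 roots are collected, parenthesized structures
# become nested lists, and the letter structure markers (ZS, SX, ...) are
# skipped.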
def reverse_order(breakdown, special_roots):
    if breakdown[0] in special_roots:
        breakdown = breakdown[1:] + breakdown[:1]
    return [reverse_order(part, special_roots) if isinstance(part, list)
            else part for part in breakdown]
def traverse(breakdown):
    flat = []
    for part in breakdown:
        flat += traverse(part) if isinstance(part, list) else [part]
    return flat
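# For example, 骨(辶(左月SX)ZX) (the breakdown of 髓 above) parses to
# ['骨', ['辶', ['左', '月']]]; reverse_order moves the special root 辶 behind the
# part it wraps, and traverse flattens the result to ['骨', '左', '月', '辶'].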
sp_map = {
    'iu': 'q',
    'en': 'w',
    'e': 'e',
    'zh': 'e',
    'ou': 'r',
    'ui': 't',
    'ue': 't',
    'ue:': 't',
    'uan': 'y',
    'u': 'u',
    'sh': 'u',
    'i': 'i',
    'ch': 'i',
    'o': 'o',
    'uo': 'o',
    'un': 'p',
    'a': 'a',
    'ao': 's',
    'ang': 'd',
    'eng': 'f',
    'ng': 'f',
    'ing': 'g',
    'er': 'g',
    'ong': 'h',
    'iong': 'h',
    'ai': 'j',
    'an': 'k',
    'ian': 'l',
    'uai': 'l',
    'iang': ';',
    'uang': ';',
    'ua': 'x',
    'ia': 'x',
    'ie': 'c',
    'u:': 'v',
    'in': 'b',
    'iao': 'n',
    'ei': 'm',
    '': '',
}
def convert_py_to_sp(py):
    if py[0] in 'a':
        return 'a' + sp_map[py]
    if py[0] in 'oe' or py == 'ng':
        return 'o' + sp_map[py]
    if py[0] in 'zcs' and py[1] == 'h':
        return sp_map[py[0:2]] + sp_map[py[2:]]
    return py[0:1] + sp_map[py[1:]]
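# convert_py_to_sp turns a full Pin-Yin syllable into a two-key code similar
# to double-Pin-Yin (shuangpin) layouts, e.g. convert_py_to_sp('zhang') == 'ed'
# and convert_py_to_sp('an') == 'ak'.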
lines = lines[i+1:]
dict = {}
pinyin = {}
incomplete = []
for line in lines:
    line = line.strip()
    # Skip empty lines and comments.
    if line == '' or line[0] == '#':
        continue
    # Skip a line that doesn't define a character.
    item = line.split()
    if len(item) < 3:
        continue
    # Extract Pin-Yin if it's a character.
    p = item[0]
    c = item[2]
    if p[0].isalpha():
        p = p[:args.pad_with_pin_yin]
        if not pinyin.has_key(c):
            pinyin[c] = [p]
        elif not p in pinyin[c]:
            pinyin[c] += [p]
    # Handle a character whose breakdown is not defined. It can be a root.
    if len(item) < 4:
        if not dict.has_key(c):
            dict[c] = ''
        continue
    # Merge the rest of the items into one string that defines the breakdown.
    d = item[3]
    for i in range(4, len(item)):
        d += item[i]
    breakdown, i, err = parse_breakdown(d, 0)
    if err != '':
        stderr.write('%s at: %s\n' % (err, line))
        exit(1)
    if i != len(d):
        stderr.write('Incomplete parse at: %s\n' % line)
        exit(1)
    # 辶 and 廴 are written last, so move them behind the parts they wrap.
    breakdown = reverse_order(breakdown, ['辶', '廴'])
    breakdown = traverse(breakdown)
    # Mark a reference to the part if it's not yet defined.
    for part in breakdown:
        if not dict.has_key(part):
            dict[part] = ''
    # The character has been seen before: warn if the breakdowns differ.
    if dict.has_key(c) and dict[c] != '':
        if breakdown != dict[c]:
            print 'Different breakdown for %s: %s, %s.' % (c,
                ''.join(dict[c]), ''.join(breakdown))
        continue
    dict[c] = breakdown
new_group = True
num_groups = 0
root_group = {} # map from roots to their group indexes
group_rep = [] # roots representing each group
dup_roots = [] # roots defined multiple times
# Read groups from group file.
with open(args.group_file) as f:
    lines = f.readlines()
for line in lines:
    if line[0] == '#':
        continue
    roots = line.split()
    if roots == []:
        new_group = True
        continue
    if new_group:
        num_groups += 1
    for root in roots[1:]:
        # Skip frequency at the end if any.
        if root[0] in '1234567890':
            continue
        if root_group.has_key(root):
            dup_roots += [root]
            continue
        root_group[root] = num_groups - 1
        if new_group:
            group_rep += [root]
            new_group = False
# Fail when there are undefined roots.
undefined_roots = [c for c, roots in dict.items()
                   if roots == '' and not root_group.has_key(c)]
if undefined_roots != []:
    stderr.write('Undefined roots: ' + ' '.join(undefined_roots) + '\n')
    exit(1)
# Warn about duplicate roots.
if dup_roots != []:
    stderr.write('Duplicate roots: ' + ' '.join(dup_roots) + '\n')
# Recursively break a character down to roots
def break_character(c):
    if dict[c] == '':
        return [c]
    roots = []
    for part in dict[c]:
        if dict.has_key(part):
            roots += break_character(part)
    return roots
# Remove minor (single-stroke) roots since they carry less info.
minor_roots = ['一', '丨', '丿', '丶']
def remove_minor_roots(roots):
    new_roots = roots[:1]
    for root in roots[1:]:
        if root not in minor_roots:
            new_roots += [root]
    return new_roots
# Generate flat dictionary where characters are broken down to roots.
flat_dict = {}
root_freq = {}
for c in dict.keys():
    if c in incomplete or c[0] == '[':
        continue
    roots = break_character(c)
    if len(roots) > args.max_code_length:
        roots = remove_minor_roots(roots)
    if len(roots) > args.max_code_length:
        roots = roots[:args.max_code_length-1] + roots[-1:]
    flat_dict[c] = roots
    for root in roots:
        if root_freq.has_key(root):
            root_freq[root] += 1
        else:
            root_freq[root] = 1
# Output root frequencies to a file.
freq_list = [[root, freq] for root, freq in root_freq.items()]
freq_list.sort(key=lambda x: x[1], reverse=True)
with open(args.freq_file, 'w') as f:
    for root, freq in freq_list:
        f.write('%s\t%d\n' % (root, freq))
# Warn about unused roots.
unused_roots = [root for root in root_group if root not in root_freq.keys()]
if unused_roots != []:
    stderr.write('Unused roots: ' + ' '.join(unused_roots) + '\n')
# Read table of character frequencies.
with open('char_freq.txt') as f:
    lines = f.readlines()
char_freq = {}
for line in lines:
    item = line.split()
    char_freq[item[0]] = int(item[1])
for c in flat_dict:
    if not char_freq.has_key(c):
        char_freq[c] = 0
# Keys used for encoding roots.
#code_keys = 'abcdefghijklmnopqrstuvwxyz;1234567890'
code_keys = 'qwertyuiopasdfgzxcvhjkl;bnm1234567890'
if args.alternative_map:
    code_keys = 'egljkaobqpzfthxsuryidvw;cnm1234567890'
# Generate the code book.
def generate_code_book():
    for c, roots in flat_dict.items():
        if not pinyin.has_key(c):
            continue
        code = ''
        for root in roots:
            code += code_keys[root_group[root]]
        plus = pinyin[c]
        for p in plus:
            code_plus = (code + p)[:args.max_code_length]
            if not code_book.has_key(code_plus):
                code_book[code_plus] = [c]
            elif not c in code_book[code_plus]:
                code_book[code_plus] += [c]
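# Illustrative example, with made-up group assignments: if a character's roots
# fall in the groups keyed 'd' and 'k' and its stored Pin-Yin prefix is 'an',
# its full code is ('dk' + 'an')[:args.max_code_length], i.e. 'dkan' with the
# default maximum length of 4.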
# Generate simple codes based on character frequencies.
def generate_simple_codes():
    for length in range(1, args.simple_code_length+1):
        simple_code_candidates = {}
        for code, characters in code_book.items():
            if len(code) <= length:
                continue
            simple_code = code[0:length]
            if length > 1 and code_book.has_key(simple_code):
                available = True
                for char in code_book[simple_code]:
                    if char not in char_simple_code:
                        available = False
                        break
                if not available:
                    continue
            if simple_code_candidates.has_key(simple_code):
                simple_code_candidates[simple_code] += characters
            else:
                simple_code_candidates[simple_code] = [] + characters
        for code, candidates in simple_code_candidates.items():
            candidates.sort(key=lambda c: char_freq[c], reverse=True)
            for c in candidates:
                if c in char_simple_code:
                    new = True
                    for simple_code in char_simple_code[c]:
                        if code.startswith(simple_code):
                            new = False
                            break
                    if new:
                        char_simple_code[c] += [code]
                        simple_code_book[code] = c
                        break
                else:
                    char_simple_code[c] = [code]
                    simple_code_book[code] = c
                    break
# Output code book to a file.
def output_code_book():
    with open(args.code_file, 'w') as f:
        for code in sorted(simple_code_book):
            f.write('%4s\t%s\t1\n' % (code, simple_code_book[code]))
        for code in sorted(code_book.keys()):
            characters = code_book[code]
            full_codes = []
            for c in characters:
                if c not in char_simple_code:
                    full_codes += [c]
            full_codes.sort(key=lambda c: char_freq[c], reverse=True)
            code_count = len(full_codes)
            for c in full_codes:
                f.write('%4s\t%s\t%d\n' % (code, c, code_count))
def count_dups():
    num_dups = 0
    for code in sorted(code_book.keys()):
        characters = code_book[code]
        full_codes = []
        for c in characters:
            if c not in char_simple_code:
                full_codes += [c]
        full_codes.sort(key=lambda c: char_freq[c], reverse=True)
        code_count = len(full_codes)
        # Don't count dups for code length 1 because many of them are
        # non-characters.
        if len(code) <= 1:
            continue
        if code_count >= 2:
            num_dups += code_count
    return num_dups
code_book = {}
generate_code_book()
simple_code_book = {}
char_simple_code = {}
generate_simple_codes()
output_code_book()
num_dups = count_dups()
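# The summary line below reports the number of coding keys (编码键数) and the
# number of characters whose full code collides with another character's
# (重码字数).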
print '编码键数: %d 重码字数: %d' % (num_groups, num_dups)
# See if optimization for root grouping is requested.
if args.optimize is None:
    exit(0)
elif args.optimize == 'all':
    roots_to_optimize = root_freq.keys()
else:
    roots_to_optimize = list(set(args.optimize.split(',')) - set(unused_roots))
non_roots = list(set(roots_to_optimize) - set(root_group.keys()))
if non_roots != []:
    stderr.write('Cannot optimize for non-root: %s\n' % ' '.join(non_roots))
    exit(1)
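# The optimizer below greedily re-assigns each selected root: it tries every
# group, re-generates the code book each time, and keeps the group with the
# fewest duplicate codes, e.g. './gene.py -o all' or './gene.py -o' with a
# comma-separated list of roots.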
# Rebuild the code book from scratch and count duplicate codes.
def evaluate_dups():
    global code_book
    code_book = {}
    generate_code_book()
    global simple_code_book
    global char_simple_code
    simple_code_book = {}
    char_simple_code = {}
    generate_simple_codes()
    return count_dups()
# Sort roots by frequency. Optimize for frequently used roots first.
roots_to_optimize.sort(key=lambda r: root_freq[r], reverse=True)
for root in roots_to_optimize:
    min_dups = num_dups
    min_group = root_group[root]
    for target_group in range(num_groups):
        root_group[root] = target_group
        new_dups = evaluate_dups()
        if len(roots_to_optimize) == 1:
            print code_keys[target_group], group_rep[target_group], new_dups
        if new_dups < min_dups:
            min_dups = new_dups
            min_group = target_group
    root_group[root] = min_group
    num_dups = min_dups
    print root, '->', code_keys[min_group], group_rep[min_group], min_dups
# Output optimized groups to a file.
with open(args.new_group_file, 'w') as f:
    for g in range(num_groups):
        # Write the group's key, its roots (most frequent first), and the
        # group's total root frequency.
        roots = [root for root, freq in freq_list if root_group[root] == g]
        freqs = [freq for root, freq in freq_list if root_group[root] == g]
        f.write('%c %s %d\n\n' % (code_keys[g], ' '.join(roots), sum(freqs)))