-
Notifications
You must be signed in to change notification settings - Fork 84
/
speech_lexicon.py
97 lines (71 loc) · 2.56 KB
/
speech_lexicon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2013, 2014, 2016 Guenter Bartsch
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import codecs
from nltools.phonetics import _normalize, IPA_normalization
#
# Lexicon load/save abstraction
#
# Path template for lexicon files: '%s' is replaced by the lexicon's file name
# (e.g. dict-de.ipa). The path is relative, so it is resolved against the
# current working directory of the process.
DICT_PATH = 'data/src/dicts/%s'
class Lexicon(object):
    """Dictionary-like view of a pronunciation lexicon stored on disk.

    The backing file holds one entry per line in the form ``key;ipa``.
    Entries are kept in two indexes:

    * ``dictionary`` maps each key to its entry dict (``{'ipa': ...}``).
    * ``multidict`` groups entries by the part of the key before the first
      underscore, so that keys such as ``foo`` and ``foo_2`` end up in the
      same group and can be retrieved together via :meth:`get_multi`.
    """

    def __init__(self, file_name):
        """Load a lexicon.

        :param file_name: E.g. dict-de.ipa or dict-en.ipa.
        :raises IOError: if the lexicon file cannot be opened.
        :raises IndexError: if a non-blank line contains no ``;`` separator.
        """
        self.file_name = file_name
        self.dictionary = {}
        self.multidict = {}

        # Decode at the I/O boundary, mirroring how save() writes the file.
        with codecs.open(DICT_PATH % self.file_name, 'r', 'utf8') as f:
            for raw_line in f:
                line = raw_line.rstrip()
                if not line:
                    # A blank line is not end-of-file: skip it instead of
                    # silently dropping every entry that follows it.
                    continue

                parts = line.split(';')
                ipas = _normalize(parts[1], IPA_normalization)

                # Route through __setitem__ so both indexes stay in sync.
                self[parts[0]] = {'ipa': ipas}

    @staticmethod
    def _base(key):
        """Return the part of *key* before the first underscore."""
        return key.split('_')[0]

    def __len__(self):
        """Return the number of entries in the lexicon."""
        return len(self.dictionary)

    def __getitem__(self, key):
        """Return the entry stored under *key*.

        :raises KeyError: if *key* is not in the lexicon.
        """
        return self.dictionary[key]

    def __iter__(self):
        """Iterate over all keys in sorted order."""
        return iter(sorted(self.dictionary))

    def __setitem__(self, k, v):
        """Store entry *v* under key *k* and add it to its group."""
        self.dictionary[k] = v
        self.multidict.setdefault(self._base(k), {})[k] = v

    def __contains__(self, key):
        """Return True if *key* is an entry of the lexicon."""
        return key in self.dictionary

    def get_multi(self, k):
        """Return all entries that share the base form of *k*.

        :param k: any key; only the part before the first underscore is used.
        :returns: dict mapping each key of the group to its entry.
        :raises KeyError: if no entry with that base form was ever stored.
        """
        return self.multidict[self._base(k)]

    def save(self):
        """Write all entries back to the lexicon file, sorted by key."""
        with codecs.open(DICT_PATH % self.file_name, 'w', 'utf8') as f:
            for w in sorted(self.dictionary):
                entry = self.dictionary[w]
                f.write(u"%s;%s\n" % (w, entry['ipa']))

    def remove(self, key):
        """Delete the entry stored under *key* from both indexes.

        :raises KeyError: if *key* is not in the lexicon.
        """
        del self.dictionary[key]

        # Also drop the entry from its group so get_multi() does not keep
        # returning it. The group dict itself is kept (possibly empty) so
        # that get_multi() on this base form does not start raising.
        group = self.multidict.get(self._base(key))
        if group is not None:
            group.pop(key, None)