From 026577b635789d67cced478cf650766bb638ea69 Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Sun, 4 Aug 2024 12:47:42 +0100 Subject: [PATCH 1/2] encoding errors? --- panphon/bin/generate_ipa_all.py | 2 +- panphon/collapse.py | 2 +- panphon/distance.py | 2 +- panphon/featuretable.py | 4 ++-- panphon/xsampa.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/panphon/bin/generate_ipa_all.py b/panphon/bin/generate_ipa_all.py index 7f19b77..a2230f7 100644 --- a/panphon/bin/generate_ipa_all.py +++ b/panphon/bin/generate_ipa_all.py @@ -134,7 +134,7 @@ def parse_dia_defs(dia_defs): def sort_all_segments(sort_order, all_segments): all_segments_list = list(all_segments) - field_order = reversed(yaml.load(open(sort_order, 'r').read(), Loader=yaml.FullLoader)) + field_order = reversed(yaml.load(open(sort_order, 'r', encoding='utf-8').read(), Loader=yaml.FullLoader)) for field in field_order: all_segments_list.sort(key=lambda seg: seg.features[field['name']], reverse=field['reverse']) diff --git a/panphon/collapse.py b/panphon/collapse.py index 0303983..921403c 100644 --- a/panphon/collapse.py +++ b/panphon/collapse.py @@ -20,7 +20,7 @@ def __init__(self, tablename='dogolpolsky_prime.yml', feature_set='spe+', featur def _load_table(self, tablename): fn = os.path.join('data', tablename) fn = pkg_resources.resource_filename(__name__, fn) - with open(fn, 'r') as f: + with open(fn, 'r', encoding='utf-8') as f: rules = [] table = yaml.load(f.read(), Loader=yaml.FullLoader) for rule in table: diff --git a/panphon/distance.py b/panphon/distance.py index 37c290a..5c646d7 100644 --- a/panphon/distance.py +++ b/panphon/distance.py @@ -67,7 +67,7 @@ def _dolgopolsky_prime(self, filename=os.path.join('data', 'dolgopolsky_prime.ym """ filename = pkg_resources.resource_filename( __name__, filename) - with open(filename, 'r') as f: + with open(filename, 'r', encoding='utf-8') as f: rules = [] dolgo_prime = yaml.load(f.read(), Loader=yaml.FullLoader) for rule in dolgo_prime: diff --git a/panphon/featuretable.py b/panphon/featuretable.py index 0800b79..df88c52 100644 --- a/panphon/featuretable.py +++ b/panphon/featuretable.py @@ -76,7 +76,7 @@ def normalize(data: str) -> str: def _read_bases(self, fn: str, weights): fn = pkg_resources.resource_filename(__name__, fn) segments = [] - with open(fn) as f: + with open(fn, encoding='utf-8') as f: reader = csv.reader(f) header = next(reader) names = header[1:] @@ -92,7 +92,7 @@ def _read_bases(self, fn: str, weights): def _read_weights(self, weights_fn: str) -> list[float]: weights_fn = pkg_resources.resource_filename(__name__, weights_fn) - with open(weights_fn) as f: + with open(weights_fn, encoding='utf-8') as f: reader = csv.reader(f) next(reader) weights = [float(x) for x in next(reader)] diff --git a/panphon/xsampa.py b/panphon/xsampa.py index 60ec9f1..35314e7 100644 --- a/panphon/xsampa.py +++ b/panphon/xsampa.py @@ -14,7 +14,7 @@ def __init__(self, delimiter=' '): def read_xsampa_table(self): filename = os.path.join('data', 'ipa-xsampa.csv') filename = pkg_resources.resource_filename(__name__, filename) - with open(filename, 'rb') as f: + with open(filename, 'rb', encoding='utf-8') as f: xs2ipa = {x[1]: x[0] for x in csv.reader(f, encoding='utf-8')} xs = sorted(xs2ipa.keys(), key=len, reverse=True) xs_regex = re.compile('|'.join(list(map(re.escape, xs)))) From 5e702b7410c8244f8afe558b1c367acb3c037314 Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Sun, 4 Aug 2024 12:55:33 +0100 Subject: [PATCH 2/2] fixing encoding errors on windows --- .gitignore | 3 +++ panphon/_panphon.py | 4 ++-- panphon/bin/align_wordlists.py | 4 ++-- panphon/bin/generate_ipa_all.py | 2 +- panphon/featuretable.py | 20 +++++++++----------- panphon/permissive.py | 4 ++-- panphon/xsampa.py | 4 ++-- 7 files changed, 21 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 0dd93cb..4d92e51 100644 --- a/.gitignore +++ b/.gitignore @@ -63,3 +63,6 @@ target/ .vscode/settings.json .DS_Store .gitignore + + +venv \ No newline at end of file diff --git a/panphon/_panphon.py b/panphon/_panphon.py index df571f7..5daa16e 100644 --- a/panphon/_panphon.py +++ b/panphon/_panphon.py @@ -140,7 +140,7 @@ def _read_table(self, filename): __name__, filename) segments = [] with open(filename, 'rb') as f: - reader = csv.reader(f, encoding='utf-8') + reader = csv.reader(f) header = next(reader) names = header[1:] for row in reader: @@ -155,7 +155,7 @@ def _read_weights(self, filename=os.path.join('data', 'feature_weights.csv')): filename = pkg_resources.resource_filename( __name__, filename) with open(filename, 'rb') as f: - reader = csv.reader(f, encoding='utf-8') + reader = csv.reader(f) next(reader) weights = [float(x) for x in next(reader)] return weights diff --git a/panphon/bin/align_wordlists.py b/panphon/bin/align_wordlists.py index e84de64..91de9e6 100644 --- a/panphon/bin/align_wordlists.py +++ b/panphon/bin/align_wordlists.py @@ -44,8 +44,8 @@ def score(indices): def main(wordlist1, wordlist2, dist_funcs): with open(wordlist1, 'rb') as file_a, open(wordlist2, 'rb') as file_b: - reader_a = csv.reader(file_a, encoding='utf-8') - reader_b = csv.reader(file_b, encoding='utf-8') + reader_a = csv.reader(file_a) + reader_b = csv.reader(file_b) print('Reading word lists...') words = zip([(w, g) for (g, w) in reader_a], [(w, g) for (g, w) in reader_b]) diff --git a/panphon/bin/generate_ipa_all.py b/panphon/bin/generate_ipa_all.py index a2230f7..fa1ed6c 100644 --- a/panphon/bin/generate_ipa_all.py +++ b/panphon/bin/generate_ipa_all.py @@ -143,7 +143,7 @@ def sort_all_segments(sort_order, all_segments): def write_ipa_all(ipa_bases, ipa_all, all_segments, sort_order): with open(ipa_bases, 'rb') as f: - reader = csv.reader(f, encoding='utf-8') + reader = csv.reader(f) fieldnames = next(reader) with open(ipa_all, 'wb') as f: writer = csv.DictWriter(f, encoding='utf-8', fieldnames=fieldnames) diff --git a/panphon/featuretable.py b/panphon/featuretable.py index df88c52..dad83dc 100644 --- a/panphon/featuretable.py +++ b/panphon/featuretable.py @@ -23,22 +23,23 @@ } class SegmentSorter: - def __init__(self, segments,): + def __init__(self, segments): self._segments = segments - self._sorted=False + self._sorted = False @property def segments(self): if not self._sorted: - self.sort_segments() + self._sort_segments() return self._segments - def sort_segments(self): - self.segments.sort(key=self.segment_key) + def _sort_segments(self): + self._segments.sort(key=self.segment_key) + self._sorted = True @staticmethod def segment_key(segment_tuple): - segment_data=segment_tuple[1] + segment_data = segment_tuple[1] return ( segment_data['syl'], segment_data['son'], segment_data['cons'], segment_data['cont'], segment_data['delrel'], segment_data['lat'], segment_data['nas'], segment_data['strid'], @@ -65,7 +66,7 @@ def __init__(self, feature_set: str='spe+'): self.longest_seg = max([len(x) for x in self.seg_dict.keys()]) self.xsampa = xsampa.XSampa() - self.sorted_segments = SegmentSorter(self.segments) #used for quick binary searches + self.sorted_segments = SegmentSorter(self.segments) @@ -542,18 +543,15 @@ def _binary_search(self, segment_list, target, fuzzy_search=False): high = mid - 1 if best_match_index is None and fuzzy_search: - # Used for fuzzy searching best_match_index = mid if best_match_index is not None: - # Check neighboring rows within the range of +-5 best_match = segment_list[best_match_index] for offset in range(-9, 5): neighbor_index = best_match_index + offset if 0 <= neighbor_index < len(segment_list): neighbor_segment = segment_list[neighbor_index] - if not self._compare_vectors(self.sorted_segments.segment_key(neighbor_segment),target): - # Check if the neighbor segment has a shorter name + if not self._compare_vectors(self.sorted_segments.segment_key(neighbor_segment), target): if len(neighbor_segment[0]) < len(best_match[0]): best_match = neighbor_segment return best_match[0] diff --git a/panphon/permissive.py b/panphon/permissive.py index 91ede62..7513eb8 100644 --- a/panphon/permissive.py +++ b/panphon/permissive.py @@ -58,7 +58,7 @@ def __init__(self, def _read_ipa_bases(self, fn): fn = pkg_resources.resource_filename(__name__, fn) with open(fn, 'rb') as f: - reader = csv.reader(f, encoding='utf-8', delimiter=str(',')) + reader = csv.reader(f, delimiter=str(',')) names = next(reader)[1:] bases = {} for row in reader: @@ -93,7 +93,7 @@ def _read_weights(self, filename=os.path.join('data', 'feature_weights.csv')): filename = pkg_resources.resource_filename( __name__, filename) with open(filename, 'rb') as f: - reader = csv.reader(f, encoding='utf-8') + reader = csv.reader(f) next(reader) weights = [float(x) for x in next(reader)] return weights diff --git a/panphon/xsampa.py b/panphon/xsampa.py index 35314e7..ad07a9f 100644 --- a/panphon/xsampa.py +++ b/panphon/xsampa.py @@ -14,8 +14,8 @@ def __init__(self, delimiter=' '): def read_xsampa_table(self): filename = os.path.join('data', 'ipa-xsampa.csv') filename = pkg_resources.resource_filename(__name__, filename) - with open(filename, 'rb', encoding='utf-8') as f: - xs2ipa = {x[1]: x[0] for x in csv.reader(f, encoding='utf-8')} + with open(filename, 'rb') as f: + xs2ipa = {x[1]: x[0] for x in csv.reader(f)} xs = sorted(xs2ipa.keys(), key=len, reverse=True) xs_regex = re.compile('|'.join(list(map(re.escape, xs)))) return xs_regex, xs2ipa