Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing encoding errors on Windows #56

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,6 @@ target/
.vscode/settings.json
.DS_Store
.gitignore


venv
4 changes: 2 additions & 2 deletions panphon/_panphon.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def _read_table(self, filename):
__name__, filename)
segments = []
with open(filename, 'rb') as f:
reader = csv.reader(f, encoding='utf-8')
reader = csv.reader(f)
header = next(reader)
names = header[1:]
for row in reader:
Expand All @@ -155,7 +155,7 @@ def _read_weights(self, filename=os.path.join('data', 'feature_weights.csv')):
filename = pkg_resources.resource_filename(
__name__, filename)
with open(filename, 'rb') as f:
reader = csv.reader(f, encoding='utf-8')
reader = csv.reader(f)
next(reader)
weights = [float(x) for x in next(reader)]
return weights
Expand Down
4 changes: 2 additions & 2 deletions panphon/bin/align_wordlists.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def score(indices):

def main(wordlist1, wordlist2, dist_funcs):
with open(wordlist1, 'rb') as file_a, open(wordlist2, 'rb') as file_b:
reader_a = csv.reader(file_a, encoding='utf-8')
reader_b = csv.reader(file_b, encoding='utf-8')
reader_a = csv.reader(file_a)
reader_b = csv.reader(file_b)
print('Reading word lists...')
words = zip([(w, g) for (g, w) in reader_a],
[(w, g) for (g, w) in reader_b])
Expand Down
4 changes: 2 additions & 2 deletions panphon/bin/generate_ipa_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def parse_dia_defs(dia_defs):

def sort_all_segments(sort_order, all_segments):
all_segments_list = list(all_segments)
field_order = reversed(yaml.load(open(sort_order, 'r').read(), Loader=yaml.FullLoader))
field_order = reversed(yaml.load(open(sort_order, 'r', encoding='utf-8').read(), Loader=yaml.FullLoader))
for field in field_order:
all_segments_list.sort(key=lambda seg: seg.features[field['name']],
reverse=field['reverse'])
Expand All @@ -143,7 +143,7 @@ def sort_all_segments(sort_order, all_segments):

def write_ipa_all(ipa_bases, ipa_all, all_segments, sort_order):
with open(ipa_bases, 'rb') as f:
reader = csv.reader(f, encoding='utf-8')
reader = csv.reader(f)
fieldnames = next(reader)
with open(ipa_all, 'wb') as f:
writer = csv.DictWriter(f, encoding='utf-8', fieldnames=fieldnames)
Expand Down
2 changes: 1 addition & 1 deletion panphon/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def __init__(self, tablename='dogolpolsky_prime.yml', feature_set='spe+', featur
def _load_table(self, tablename):
fn = os.path.join('data', tablename)
fn = pkg_resources.resource_filename(__name__, fn)
with open(fn, 'r') as f:
with open(fn, 'r', encoding='utf-8') as f:
rules = []
table = yaml.load(f.read(), Loader=yaml.FullLoader)
for rule in table:
Expand Down
2 changes: 1 addition & 1 deletion panphon/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def _dolgopolsky_prime(self, filename=os.path.join('data', 'dolgopolsky_prime.ym
"""
filename = pkg_resources.resource_filename(
__name__, filename)
with open(filename, 'r') as f:
with open(filename, 'r', encoding='utf-8') as f:
rules = []
dolgo_prime = yaml.load(f.read(), Loader=yaml.FullLoader)
for rule in dolgo_prime:
Expand Down
24 changes: 11 additions & 13 deletions panphon/featuretable.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,23 @@
}

class SegmentSorter:
def __init__(self, segments,):
def __init__(self, segments):
self._segments = segments
self._sorted=False
self._sorted = False

@property
def segments(self):
if not self._sorted:
self.sort_segments()
self._sort_segments()
return self._segments

def sort_segments(self):
self.segments.sort(key=self.segment_key)
def _sort_segments(self):
self._segments.sort(key=self.segment_key)
self._sorted = True

@staticmethod
def segment_key(segment_tuple):
segment_data=segment_tuple[1]
segment_data = segment_tuple[1]
return (
segment_data['syl'], segment_data['son'], segment_data['cons'], segment_data['cont'],
segment_data['delrel'], segment_data['lat'], segment_data['nas'], segment_data['strid'],
Expand All @@ -65,7 +66,7 @@ def __init__(self, feature_set: str='spe+'):
self.longest_seg = max([len(x) for x in self.seg_dict.keys()])
self.xsampa = xsampa.XSampa()

self.sorted_segments = SegmentSorter(self.segments) #used for quick binary searches
self.sorted_segments = SegmentSorter(self.segments)



Expand All @@ -76,7 +77,7 @@ def normalize(data: str) -> str:
def _read_bases(self, fn: str, weights):
fn = pkg_resources.resource_filename(__name__, fn)
segments = []
with open(fn) as f:
with open(fn, encoding='utf-8') as f:
reader = csv.reader(f)
header = next(reader)
names = header[1:]
Expand All @@ -92,7 +93,7 @@ def _read_bases(self, fn: str, weights):

def _read_weights(self, weights_fn: str) -> list[float]:
weights_fn = pkg_resources.resource_filename(__name__, weights_fn)
with open(weights_fn) as f:
with open(weights_fn, encoding='utf-8') as f:
reader = csv.reader(f)
next(reader)
weights = [float(x) for x in next(reader)]
Expand Down Expand Up @@ -542,18 +543,15 @@ def _binary_search(self, segment_list, target, fuzzy_search=False):
high = mid - 1

if best_match_index is None and fuzzy_search:
# Used for fuzzy searching
best_match_index = mid

if best_match_index is not None:
# Check neighboring rows within the range of +-5
best_match = segment_list[best_match_index]
for offset in range(-9, 5):
neighbor_index = best_match_index + offset
if 0 <= neighbor_index < len(segment_list):
neighbor_segment = segment_list[neighbor_index]
if not self._compare_vectors(self.sorted_segments.segment_key(neighbor_segment),target):
# Check if the neighbor segment has a shorter name
if not self._compare_vectors(self.sorted_segments.segment_key(neighbor_segment), target):
if len(neighbor_segment[0]) < len(best_match[0]):
best_match = neighbor_segment
return best_match[0]
Expand Down
4 changes: 2 additions & 2 deletions panphon/permissive.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(self,
def _read_ipa_bases(self, fn):
fn = pkg_resources.resource_filename(__name__, fn)
with open(fn, 'rb') as f:
reader = csv.reader(f, encoding='utf-8', delimiter=str(','))
reader = csv.reader(f, delimiter=str(','))
names = next(reader)[1:]
bases = {}
for row in reader:
Expand Down Expand Up @@ -93,7 +93,7 @@ def _read_weights(self, filename=os.path.join('data', 'feature_weights.csv')):
filename = pkg_resources.resource_filename(
__name__, filename)
with open(filename, 'rb') as f:
reader = csv.reader(f, encoding='utf-8')
reader = csv.reader(f)
next(reader)
weights = [float(x) for x in next(reader)]
return weights
Expand Down
2 changes: 1 addition & 1 deletion panphon/xsampa.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def read_xsampa_table(self):
filename = os.path.join('data', 'ipa-xsampa.csv')
filename = pkg_resources.resource_filename(__name__, filename)
with open(filename, 'rb') as f:
xs2ipa = {x[1]: x[0] for x in csv.reader(f, encoding='utf-8')}
xs2ipa = {x[1]: x[0] for x in csv.reader(f)}
xs = sorted(xs2ipa.keys(), key=len, reverse=True)
xs_regex = re.compile('|'.join(list(map(re.escape, xs))))
return xs_regex, xs2ipa
Expand Down