Refactor context processing #152

Open
wants to merge 10 commits into main
67 changes: 23 additions & 44 deletions pyndl/preprocess.py
@@ -100,9 +100,9 @@ def ngrams_to_word(occurrences, n_chars, outfile, remove_duplicates=True):
        if not ngrams or not occurrence:
            continue
        if remove_duplicates:
-            outfile.write("{}\t{}\n".format("_".join(set(ngrams)), occurrence))
-        else:
-            outfile.write("{}\t{}\n".format("_".join(ngrams), occurrence))
+            ngrams = set(ngrams)
+            occurrence = "_".join(set(occurrence.split("_")))
+        outfile.write("{}\t{}\n".format("_".join(ngrams), occurrence))


def process_occurrences(occurrences, outfile, *,
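
For illustration only (this snippet is not part of the diff): with remove_duplicates=True, the new branch deduplicates both the n-gram cues and the outcome tokens before the tab-separated event line is written.

    # standalone sketch of the new remove_duplicates behaviour
    ngrams = ["ab", "bc", "ab"]          # cues with a duplicate
    occurrence = "cat_cat"               # outcomes with a duplicate
    ngrams = set(ngrams)
    occurrence = "_".join(set(occurrence.split("_")))
    print("{}\t{}".format("_".join(ngrams), occurrence))
    # prints e.g. "ab_bc	cat" (set iteration order is arbitrary)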
@@ -245,19 +245,16 @@ def gen_occurrences(words):
        """
        if event_structure == 'consecutive_words':
            occurrences = list()
-            cur_words = list()
-            ii = 0
-            while True:
-                if ii < len(words):
-                    cur_words.append(words[ii])
-                if ii >= len(words) or ii >= number_of_words:
-                    # remove the first word
-                    cur_words = cur_words[1:]
+            # can't have more consecutive words than total words
+            length = min(number_of_words, len(words))
+            # slide window over list of words
+            for ii in range(1 - length, len(words)):
+                # no consecutive words before first word
+                start = max(ii, 0)
+                # no consecutive words after last word
+                end = min(ii + length, len(words))
                # append (cues, outcomes) with empty outcomes
-                occurrences.append(("_".join(cur_words), ''))
-                ii += 1
-                if not cur_words:
-                    break
+                occurrences.append(("_".join(words[start:end]), ""))
            return occurrences
        # for words = (A, B, C, D); before = 2, after = 1
        # make: (B, A), (A_C, B), (A_B_D, C), (B_C, D)
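
As a rough, self-contained sketch of the refactored 'consecutive_words' branch (the helper name and example words below are made up for illustration, not part of the PR), each window position yields one event with empty outcomes:

    def consecutive_occurrences(words, number_of_words):
        """Mimics the refactored 'consecutive_words' branch."""
        occurrences = []
        # can't have more consecutive words than total words
        length = min(number_of_words, len(words))
        # slide window over list of words
        for ii in range(1 - length, len(words)):
            start = max(ii, 0)                  # no words before the first word
            end = min(ii + length, len(words))  # no words after the last word
            occurrences.append(("_".join(words[start:end]), ""))
        return occurrences

    print(consecutive_occurrences(["A", "B", "C"], 2))
    # [('A', ''), ('A_B', ''), ('B_C', ''), ('C', '')]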
@@ -274,6 +271,8 @@ def gen_occurrences(words):
        elif event_structure == 'line':
            # (cues, outcomes) with empty outcomes
            return [('_'.join(words), ''), ]
+        else:
+            raise ValueError('gen_occurrences should be one of {"consecutive_words", "word_to_word", "line"}')

    def process_line(line):
        """processes one line of text."""
@@ -294,13 +293,6 @@ def process_words(words):
                            cue_structure=cue_structure,
                            remove_duplicates=remove_duplicates)

-    def process_context(line):
-        """called when a context boundary is found."""
-        if context_structure == 'document':
-            # remove document marker
-            line = context_pattern.sub("", line)
-        return line
-
    with open(corpus_file, "rt") as corpus:
        with gzip.open(event_file, "wt") as outfile:
            outfile.write("cues\toutcomes\n")
@@ -319,29 +311,16 @@ def process_context(line):
                    process_words(words)
                else:
                    if context_pattern.search(line) is not None:
-                        # process the first context
-                        context1, *contexts = context_pattern.split(line)
-                        context1 = process_context(context1)
-
-                        if context1.strip():
-                            context1 = process_line(context1.strip())
-                            words.extend(gen_words(context1))
-                        process_words(words)
-                        # process in between contexts
-                        while len(contexts) > 1:
-                            words = []
-                            context1, *contexts = contexts
-                            context1 = process_context(context1)
-                            if context1.strip():
-                                context1 = process_line(context1.strip())
-                                words.extend(gen_words(context1))
+                        contexts = context_pattern.split(line)
+
+                        # process contexts; only extend words on last context
+                        for jj, context in enumerate(contexts):
+                            context = process_line(context.strip())
+                            words.extend(gen_words(context))
+                            if jj < len(contexts):
+                                process_words(words)
-                        # add last part to next context
-                        context1 = contexts[0]
-                        context1 = process_context(context1)
-                        if context1.strip():
-                            context1 = process_line(context1.strip())
-                            words.extend(gen_words(context1))
+                                words = []

                    else:
                        line = process_line(line)
                        words.extend(gen_words(line))
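
For reference, a minimal sketch of what context_pattern.split produces for a line containing a document boundary (the marker string and pattern here are hypothetical; pyndl builds the real context_pattern elsewhere):

    import re

    # hypothetical document marker, for illustration only
    context_pattern = re.compile("---END---")
    line = "last words of document one ---END--- first words of document two"
    contexts = context_pattern.split(line)
    print(contexts)
    # ['last words of document one ', ' first words of document two']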
2 changes: 1 addition & 1 deletion setup.cfg
@@ -4,7 +4,7 @@ max-line-length = 120

[pylint]
max-line-length = 120
-good-names = nn, ii, _
+good-names = nn, ii, _, jj
extension-pkg-whitelist=numpy,pyndl.ndl_parallel
ignore=pyndl/ndl_parallel
disable=E1101
Binary file modified tests/reference/event_file_bigrams_to_word.tab.gz
Binary file modified tests/reference/event_file_trigrams_to_word.tab.gz
Binary file modified tests/reference/event_file_trigrams_to_word_line_based.tab.gz