diff --git a/pyndl/preprocess.py b/pyndl/preprocess.py index 2d48022..b530dfb 100644 --- a/pyndl/preprocess.py +++ b/pyndl/preprocess.py @@ -100,9 +100,9 @@ def ngrams_to_word(occurrences, n_chars, outfile, remove_duplicates=True): if not ngrams or not occurrence: continue if remove_duplicates: - outfile.write("{}\t{}\n".format("_".join(set(ngrams)), occurrence)) - else: - outfile.write("{}\t{}\n".format("_".join(ngrams), occurrence)) + ngrams = set(ngrams) + occurrence = "_".join(set(occurrence.split("_"))) + outfile.write("{}\t{}\n".format("_".join(ngrams), occurrence)) def process_occurrences(occurrences, outfile, *, @@ -245,19 +245,16 @@ def gen_occurrences(words): """ if event_structure == 'consecutive_words': occurrences = list() - cur_words = list() - ii = 0 - while True: - if ii < len(words): - cur_words.append(words[ii]) - if ii >= len(words) or ii >= number_of_words: - # remove the first word - cur_words = cur_words[1:] + # can't have more consecutive words than total words + length = min(number_of_words, len(words)) + # slide window over list of words + for ii in range(1 - length, len(words)): + # no consecutive words before first word + start = max(ii, 0) + # no consecutive words after last word + end = min(ii + length, len(words)) # append (cues, outcomes) with empty outcomes - occurrences.append(("_".join(cur_words), '')) - ii += 1 - if not cur_words: - break + occurrences.append(("_".join(words[start:end]), "")) return occurrences # for words = (A, B, C, D); before = 2, after = 1 # make: (B, A), (A_C, B), (A_B_D, C), (B_C, D) @@ -274,6 +271,8 @@ def gen_occurrences(words): elif event_structure == 'line': # (cues, outcomes) with empty outcomes return [('_'.join(words), ''), ] + else: + raise ValueError('gen_occurrences should be one of {"consecutive_words", "word_to_word", "line"}') def process_line(line): """processes one line of text.""" @@ -294,13 +293,6 @@ def process_words(words): cue_structure=cue_structure, remove_duplicates=remove_duplicates) - def process_context(line): - """called when a context boundary is found.""" - if context_structure == 'document': - # remove document marker - line = context_pattern.sub("", line) - return line - with open(corpus_file, "rt") as corpus: with gzip.open(event_file, "wt") as outfile: outfile.write("cues\toutcomes\n") @@ -319,29 +311,16 @@ def process_context(line): process_words(words) else: if context_pattern.search(line) is not None: - # process the first context - context1, *contexts = context_pattern.split(line) - context1 = process_context(context1) - - if context1.strip(): - context1 = process_line(context1.strip()) - words.extend(gen_words(context1)) - process_words(words) - # process in between contexts - while len(contexts) > 1: - words = [] - context1, *contexts = contexts - context1 = process_context(context1) - if context1.strip(): - context1 = process_line(context1.strip()) - words.extend(gen_words(context1)) + contexts = context_pattern.split(line) + + # process contexts; only extend words on last context + for jj, context in enumerate(contexts): + context = process_line(context.strip()) + words.extend(gen_words(context)) + if jj < len(contexts): process_words(words) - # add last part to next context - context1 = contexts[0] - context1 = process_context(context1) - if context1.strip(): - context1 = process_line(context1.strip()) - words.extend(gen_words(context1)) + words = [] + else: line = process_line(line) words.extend(gen_words(line)) diff --git a/setup.cfg b/setup.cfg index dafbe18..b061ab0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ max-line-length = 120 [pylint] max-line-length = 120 -good-names = nn, ii, _ +good-names = nn, ii, _, jj extension-pkg-whitelist=numpy,pyndl.ndl_parallel ignore=pyndl/ndl_parallel disable=E1101 diff --git a/tests/reference/event_file_bigrams_to_word.tab.gz b/tests/reference/event_file_bigrams_to_word.tab.gz index f3b1e4f..4185692 100644 Binary files a/tests/reference/event_file_bigrams_to_word.tab.gz and b/tests/reference/event_file_bigrams_to_word.tab.gz differ diff --git a/tests/reference/event_file_trigrams_to_word.tab.gz b/tests/reference/event_file_trigrams_to_word.tab.gz index c5708b2..14b14af 100644 Binary files a/tests/reference/event_file_trigrams_to_word.tab.gz and b/tests/reference/event_file_trigrams_to_word.tab.gz differ diff --git a/tests/reference/event_file_trigrams_to_word_line_based.tab.gz b/tests/reference/event_file_trigrams_to_word_line_based.tab.gz index 2ef4480..9e78696 100644 Binary files a/tests/reference/event_file_trigrams_to_word_line_based.tab.gz and b/tests/reference/event_file_trigrams_to_word_line_based.tab.gz differ