Refactor context processing #152

Open
wants to merge 10 commits into main
67 changes: 23 additions & 44 deletions pyndl/preprocess.py
@@ -100,9 +100,9 @@ def ngrams_to_word(occurrences, n_chars, outfile, remove_duplicates=True):
        if not ngrams or not occurrence:
            continue
        if remove_duplicates:
-            outfile.write("{}\t{}\n".format("_".join(set(ngrams)), occurrence))
-        else:
-            outfile.write("{}\t{}\n".format("_".join(ngrams), occurrence))
+            ngrams = set(ngrams)
+            occurrence = "_".join(set(occurrence.split("_")))
+        outfile.write("{}\t{}\n".format("_".join(ngrams), occurrence))


def process_occurrences(occurrences, outfile, *,
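
For illustration only (this snippet is not part of the diff): with remove_duplicates=True, the new branch deduplicates both the n-gram cues and the outcome tokens before the tab-separated event line is written.

    # standalone sketch of the new remove_duplicates behaviour
    ngrams = ["ab", "bc", "ab"]          # cues with a duplicate
    occurrence = "cat_cat"               # outcomes with a duplicate
    ngrams = set(ngrams)
    occurrence = "_".join(set(occurrence.split("_")))
    print("{}\t{}".format("_".join(ngrams), occurrence))
    # prints e.g. "ab_bc	cat" (set iteration order is arbitrary)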
@@ -245,19 +245,16 @@ def gen_occurrences(words):
        """
        if event_structure == 'consecutive_words':
            occurrences = list()
-            cur_words = list()
-            ii = 0
-            while True:
-                if ii < len(words):
-                    cur_words.append(words[ii])
-                if ii >= len(words) or ii >= number_of_words:
-                    # remove the first word
-                    cur_words = cur_words[1:]
+            # can't have more consecutive words than total words
+            length = min(number_of_words, len(words))
+            # slide window over list of words
+            for ii in range(1 - length, len(words)):
+                # no consecutive words before first word
+                start = max(ii, 0)
+                # no consecutive words after last word
+                end = min(ii + length, len(words))
                # append (cues, outcomes) with empty outcomes
-                occurrences.append(("_".join(cur_words), ''))
-                ii += 1
-                if not cur_words:
-                    break
+                occurrences.append(("_".join(words[start:end]), ""))
            return occurrences
        # for words = (A, B, C, D); before = 2, after = 1
        # make: (B, A), (A_C, B), (A_B_D, C), (B_C, D)
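
As a rough, self-contained sketch of the refactored 'consecutive_words' branch (the helper name and example words below are made up for illustration, not part of the PR), each window position yields one event with empty outcomes:

    def consecutive_occurrences(words, number_of_words):
        """Mimics the refactored 'consecutive_words' branch."""
        occurrences = []
        # can't have more consecutive words than total words
        length = min(number_of_words, len(words))
        # slide window over list of words
        for ii in range(1 - length, len(words)):
            start = max(ii, 0)                  # no words before the first word
            end = min(ii + length, len(words))  # no words after the last word
            occurrences.append(("_".join(words[start:end]), ""))
        return occurrences

    print(consecutive_occurrences(["A", "B", "C"], 2))
    # [('A', ''), ('A_B', ''), ('B_C', ''), ('C', '')]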
@@ -274,6 +271,8 @@ def gen_occurrences(words):
        elif event_structure == 'line':
            # (cues, outcomes) with empty outcomes
            return [('_'.join(words), ''), ]
+        else:
+            raise ValueError('gen_occurrences should be one of {"consecutive_words", "word_to_word", "line"}')

    def process_line(line):
        """processes one line of text."""
@@ -294,13 +293,6 @@ def process_words(words):
                            cue_structure=cue_structure,
                            remove_duplicates=remove_duplicates)

-    def process_context(line):
-        """called when a context boundary is found."""
-        if context_structure == 'document':
-            # remove document marker
-            line = context_pattern.sub("", line)
-        return line
-
    with open(corpus_file, "rt") as corpus:
        with gzip.open(event_file, "wt") as outfile:
            outfile.write("cues\toutcomes\n")
@@ -319,29 +311,16 @@ def process_context(line):
                    process_words(words)
                else:
                    if context_pattern.search(line) is not None:
-                        # process the first context
-                        context1, *contexts = context_pattern.split(line)
-                        context1 = process_context(context1)
-
-                        if context1.strip():
-                            context1 = process_line(context1.strip())
-                            words.extend(gen_words(context1))
-                        process_words(words)
-                        # process in between contexts
-                        while len(contexts) > 1:
-                            words = []
-                            context1, *contexts = contexts
-                            context1 = process_context(context1)
-                            if context1.strip():
-                                context1 = process_line(context1.strip())
-                                words.extend(gen_words(context1))
+                        contexts = context_pattern.split(line)
+
+                        # process contexts; only extend words on last context
+                        for jj, context in enumerate(contexts):
+                            context = process_line(context.strip())
+                            words.extend(gen_words(context))
+                            if jj < len(contexts):
+                                process_words(words)
-                        # add last part to next context
-                        context1 = contexts[0]
-                        context1 = process_context(context1)
-                        if context1.strip():
-                            context1 = process_line(context1.strip())
-                            words.extend(gen_words(context1))
+                                words = []

                    else:
                        line = process_line(line)
                        words.extend(gen_words(line))
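
For reference, a minimal sketch of what context_pattern.split produces for a line containing a document boundary (the marker string and pattern here are hypothetical; pyndl builds the real context_pattern elsewhere):

    import re

    # hypothetical document marker, for illustration only
    context_pattern = re.compile("---END---")
    line = "last words of document one ---END--- first words of document two"
    contexts = context_pattern.split(line)
    print(contexts)
    # ['last words of document one ', ' first words of document two']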
2 changes: 1 addition & 1 deletion setup.cfg
@@ -4,7 +4,7 @@ max-line-length = 120

[pylint]
max-line-length = 120
-good-names = nn, ii, _
+good-names = nn, ii, _, jj
extension-pkg-whitelist=numpy,pyndl.ndl_parallel
ignore=pyndl/ndl_parallel
disable=E1101
Binary file modified tests/reference/event_file_bigrams_to_word.tab.gz
Binary file modified tests/reference/event_file_trigrams_to_word.tab.gz
Binary file modified tests/reference/event_file_trigrams_to_word_line_based.tab.gz