Commit 40228c6
Earley now uses OrderedSet for better output stability

erezsh committed Aug 23, 2023
1 parent 656334c · commit 40228c6
Showing 4 changed files with 41 additions and 17 deletions.
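
Context for the change (illustration, not part of the commit): Python's built-in set makes no guarantee about iteration order — for strings it depends on per-process hash randomization, so the same parse can emit ambiguous results in a different order from one interpreter run to the next. A dict, by contrast, preserves insertion order (guaranteed since Python 3.7), which is the property the new OrderedSet relies on. A minimal sketch of the difference:

    # Illustration only -- not from the commit. With hash randomization on
    # (Python's default), the iteration order of a set of strings can differ
    # between interpreter runs; dict.fromkeys always preserves insertion order.
    items = ["rule_a", "rule_b", "rule_c", "rule_d"]

    unordered = set(items)
    ordered = dict.fromkeys(items)   # the trick the new OrderedSet relies on

    print("set order: ", list(unordered))   # may change from run to run
    print("dict order:", list(ordered))     # always: rule_a, rule_b, rule_c, rule_d
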
12 changes: 6 additions & 6 deletions lark/parsers/earley.py
@@ -16,7 +16,7 @@
 from ..lexer import Token
 from ..tree import Tree
 from ..exceptions import UnexpectedEOF, UnexpectedToken
-from ..utils import logger
+from ..utils import logger, OrderedSet
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal
 from .earley_common import Item
@@ -190,13 +190,13 @@ def scan(i, token, to_scan):
             Earley predictor, based on the previously completed tokens.
             This ensures that at each phase of the parse we have a custom
             lexer context, allowing for more complex ambiguities."""
-            next_to_scan = set()
-            next_set = set()
+            next_to_scan = OrderedSet()
+            next_set = OrderedSet()
             columns.append(next_set)
             transitives.append({})
             node_cache = {}

-            for item in set(to_scan):
+            for item in OrderedSet(to_scan):
                 if match(item.expect, token):
                     new_item = item.advance()
                     label = (new_item.s, new_item.start, i)
@@ -260,8 +260,8 @@ def parse(self, lexer, start):
         assert start, start
         start_symbol = NonTerminal(start)

-        columns = [set()]
-        to_scan = set()     # The scan buffer. 'Q' in E.Scott's paper.
+        columns = [OrderedSet()]
+        to_scan = OrderedSet()     # The scan buffer. 'Q' in E.Scott's paper.

         ## Predict for the start_symbol.
         # Add predicted items to the first Earley set (for the predictor) if they
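
A side note on the scan loop (illustration, not from the commit): `set(to_scan)` in the old code served two purposes — it iterated the scan buffer and it took a snapshot so the loop body could mutate `to_scan` safely. `OrderedSet(to_scan)` keeps the snapshot behaviour (its constructor accepts any iterable) while making the iteration order stable. A small sketch of the same pattern outside the parser:

    from lark.utils import OrderedSet   # added by this commit

    to_scan = OrderedSet(["item_a", "item_b"])

    # Iterating a copy lets the loop body mutate the original buffer safely,
    # exactly like the old `for item in set(to_scan):` -- but in stable order.
    for item in OrderedSet(to_scan):
        if item == "item_b":
            to_scan.add("item_c")    # does not disturb the ongoing iteration

    print(list(to_scan))             # ['item_a', 'item_b', 'item_c']
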
10 changes: 5 additions & 5 deletions lark/parsers/earley_forest.py
@@ -15,7 +15,7 @@

 from ..parse_tree_builder import AmbiguousIntermediateExpander
 from ..visitors import Discard
-from ..utils import logger
+from ..utils import logger, OrderedSet
 from ..tree import Tree

 class ForestNode:
@@ -49,8 +49,8 @@ def __init__(self, s, start, end):
         self.s = s
         self.start = start
         self.end = end
-        self._children = set()
-        self.paths = set()
+        self._children = OrderedSet()
+        self.paths = OrderedSet()
         self.paths_loaded = False

         ### We use inf here as it can be safely negated without resorting to conditionals,
@@ -280,10 +280,10 @@ def visit(self, root):
         # of a symbol/intermediate so that we can process both up and down. Also,
         # since the SPPF can have cycles it allows us to detect if we're trying
         # to recurse into a node that's already on the stack (infinite recursion).
-        visiting = set()
+        visiting = OrderedSet()

         # set of all nodes that have been visited
-        visited = set()
+        visited = OrderedSet()

         # a list of nodes that are currently being visited
         # used for the `on_cycle` callback
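
The `visiting` set here implements the standard on-stack check for walking a graph that may contain cycles; switching it to OrderedSet changes only the order in which nodes are explored, not the logic. A minimal, self-contained sketch of the pattern (hypothetical node graph, not the real SPPF API):

    def visit(node, children, visiting, visited, on_cycle):
        # `visiting` = nodes currently on the recursion stack; `visited` = done.
        if node in visiting:
            on_cycle(node)           # recursing into a node already on the stack
            return
        if node in visited:
            return
        visiting.add(node)
        for child in children.get(node, ()):
            visit(child, children, visiting, visited, on_cycle)
        visiting.remove(node)
        visited.add(node)

    graph = {"a": ["b"], "b": ["a"]}          # a tiny cyclic graph: a -> b -> a
    visit("a", graph, set(), set(), lambda n: print("cycle at", n))
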
10 changes: 5 additions & 5 deletions lark/parsers/xearley.py
@@ -1,4 +1,4 @@
"""This module implements an experimental Earley parser with a dynamic lexer
"""This module implements an Earley parser with a dynamic lexer
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497
@@ -22,7 +22,7 @@
 from ..grammar import Terminal
 from .earley import Parser as BaseParser
 from .earley_forest import SymbolNode, TokenNode
-
+from ..utils import OrderedSet

 class Parser(BaseParser):
     def __init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity=True, complete_lex = False, debug=False, tree_class=Tree):
@@ -49,7 +49,7 @@ def scan(i, to_scan):
             # they complete, we push all tokens into a buffer (delayed_matches), to
             # be held possibly for a later parse step when we reach the point in the
             # input stream at which they complete.
-            for item in set(to_scan):
+            for item in OrderedSet(to_scan):
                 m = match(item.expect, stream, i)
                 if m:
                     t = Token(item.expect.name, m.group(0), i, text_line, text_column)
@@ -81,8 +81,8 @@ def scan(i, to_scan):
                     # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed.
                     delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

-            next_to_scan = set()
-            next_set = set()
+            next_to_scan = OrderedSet()
+            next_set = OrderedSet()
             columns.append(next_set)
             transitives.append({})

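
The `delayed_matches` buffer mentioned in the comments above is keyed by the input position where a (possibly multi-character) match ends — see `delayed_matches[m.end()]` in the hunk — so a token found at position i can wait until the main loop reaches its end position. A rough sketch of the idea under assumptions (simplified token shape and a hypothetical `remember` helper, not xearley's real data types):

    from collections import defaultdict

    # position in the input -> list of matches that complete there
    delayed_matches = defaultdict(list)

    def remember(item, start, text, match_end):
        # Hold the match until the main scan loop reaches `match_end`.
        delayed_matches[match_end].append((item, start, text))

    remember("ITEM_A", 0, "foo", 3)
    remember("ITEM_B", 1, "oobar", 6)

    for pos in range(7):            # stand-in for the parser's main scan loop
        for item, start, text in delayed_matches.pop(pos, []):
            print(f"at {pos}: scan {item!r} matched {text!r} from {start}")
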
26 changes: 25 additions & 1 deletion lark/utils.py
@@ -2,7 +2,7 @@
 import os
 from itertools import product
 from collections import deque
-from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable
+from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable, Generic

 ###{standalone
 import sys, re
@@ -328,3 +328,27 @@ def small_factors(n: int, max_factor: int) -> List[Tuple[int, int]]:
         if a + b <= max_factor:
             return small_factors(r, max_factor) + [(a, b)]
     assert False, "Failed to factorize %s" % n
+
+
+class OrderedSet(Generic[T]):
+    """A minimal OrderedSet implementation, using a dictionary.
+
+    (relies on the dictionary being ordered)
+    """
+    def __init__(self, items: Iterable[T] =()):
+        self.d = dict.fromkeys(items)
+
+    def __contains__(self, item: T) -> bool:
+        return item in self.d
+
+    def add(self, item: T):
+        self.d[item] = None
+
+    def __iter__(self) -> Iterator[T]:
+        return iter(self.d)
+
+    def remove(self, item: T):
+        del self.d[item]
+
+    def __bool__(self):
+        return bool(self.d)
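
A quick usage sketch of the new class (illustration, not from the commit): it supports the small subset of the set API the parsers need — membership, add, remove, iteration, and truthiness — and iterates in insertion order because the backing dict preserves it.

    from lark.utils import OrderedSet

    s = OrderedSet(["b", "a"])
    s.add("c")
    s.add("a")            # re-adding an existing item keeps its original position

    assert "a" in s                      # __contains__
    assert list(s) == ["b", "a", "c"]    # __iter__: insertion order, no duplicates

    s.remove("b")         # deletes from the backing dict; KeyError if absent
    assert bool(s)        # truthy while non-empty
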