diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index a6229f888..9e09c438a 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -16,7 +16,7 @@
 from ..lexer import Token
 from ..tree import Tree
 from ..exceptions import UnexpectedEOF, UnexpectedToken
-from ..utils import logger
+from ..utils import logger, OrderedSet
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal
 from .earley_common import Item
@@ -190,13 +190,13 @@ def scan(i, token, to_scan):
             Earley predictor, based on the previously completed tokens.
             This ensures that at each phase of the parse we have a custom
             lexer context, allowing for more complex ambiguities."""
-            next_to_scan = set()
-            next_set = set()
+            next_to_scan = OrderedSet()
+            next_set = OrderedSet()
             columns.append(next_set)
             transitives.append({})
             node_cache = {}
 
-            for item in set(to_scan):
+            for item in OrderedSet(to_scan):
                 if match(item.expect, token):
                     new_item = item.advance()
                     label = (new_item.s, new_item.start, i)
@@ -260,8 +260,8 @@ def parse(self, lexer, start):
         assert start, start
         start_symbol = NonTerminal(start)
 
-        columns = [set()]
-        to_scan = set()     # The scan buffer. 'Q' in E.Scott's paper.
+        columns = [OrderedSet()]
+        to_scan = OrderedSet()     # The scan buffer. 'Q' in E.Scott's paper.
 
         ## Predict for the start_symbol.
         # Add predicted items to the first Earley set (for the predictor) if they
diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py
index 6763af619..0a692e016 100644
--- a/lark/parsers/earley_forest.py
+++ b/lark/parsers/earley_forest.py
@@ -15,7 +15,7 @@
 
 from ..parse_tree_builder import AmbiguousIntermediateExpander
 from ..visitors import Discard
-from ..utils import logger
+from ..utils import logger, OrderedSet
 from ..tree import Tree
 
 class ForestNode:
@@ -49,8 +49,8 @@ def __init__(self, s, start, end):
         self.s = s
         self.start = start
         self.end = end
-        self._children = set()
-        self.paths = set()
+        self._children = OrderedSet()
+        self.paths = OrderedSet()
         self.paths_loaded = False
 
         ### We use inf here as it can be safely negated without resorting to conditionals,
@@ -280,10 +280,10 @@ def visit(self, root):
         # of a symbol/intermediate so that we can process both up and down. Also,
         # since the SPPF can have cycles it allows us to detect if we're trying
         # to recurse into a node that's already on the stack (infinite recursion).
-        visiting = set()
+        visiting = OrderedSet()
 
         # set of all nodes that have been visited
-        visited = set()
+        visited = OrderedSet()
 
         # a list of nodes that are currently being visited
         # used for the `on_cycle` callback
diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py
index 343e5c0b6..944be18a7 100644
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -1,4 +1,4 @@
-"""This module implements an experimental Earley parser with a dynamic lexer
+"""This module implements an Earley parser with a dynamic lexer
 
 The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
     https://www.sciencedirect.com/science/article/pii/S1571066108001497
@@ -22,7 +22,7 @@
 from ..grammar import Terminal
 from .earley import Parser as BaseParser
 from .earley_forest import SymbolNode, TokenNode
-
+from ..utils import OrderedSet
 
 class Parser(BaseParser):
     def __init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity=True, complete_lex = False, debug=False, tree_class=Tree):
@@ -49,7 +49,7 @@ def scan(i, to_scan):
             # they complete, we push all tokens into a buffer (delayed_matches), to
             # be held possibly for a later parse step when we reach the point in the
             # input stream at which they complete.
-            for item in set(to_scan):
+            for item in OrderedSet(to_scan):
                 m = match(item.expect, stream, i)
                 if m:
                     t = Token(item.expect.name, m.group(0), i, text_line, text_column)
@@ -81,8 +81,8 @@ def scan(i, to_scan):
                     # If we're ignoring up to the end of the file,
                     # carry over the start symbol if it already completed.
                     delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])
 
-            next_to_scan = set()
-            next_set = set()
+            next_to_scan = OrderedSet()
+            next_set = OrderedSet()
             columns.append(next_set)
             transitives.append({})
diff --git a/lark/utils.py b/lark/utils.py
index b47096f2c..aa145b152 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -2,7 +2,7 @@
 import os
 from itertools import product
 from collections import deque
-from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable
+from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable, Generic
 
 ###{standalone
 import sys, re
@@ -328,3 +328,27 @@ def small_factors(n: int, max_factor: int) -> List[Tuple[int, int]]:
         if a + b <= max_factor:
             return small_factors(r, max_factor) + [(a, b)]
     assert False, "Failed to factorize %s" % n
+
+
+class OrderedSet(Generic[T]):
+    """A minimal OrderedSet implementation, using a dictionary.
+
+    (relies on the dictionary being ordered)
+    """
+    def __init__(self, items: Iterable[T] = ()):
+        self.d = dict.fromkeys(items)
+
+    def __contains__(self, item: T) -> bool:
+        return item in self.d
+
+    def add(self, item: T):
+        self.d[item] = None
+
+    def __iter__(self) -> Iterator[T]:
+        return iter(self.d)
+
+    def remove(self, item: T):
+        del self.d[item]
+
+    def __bool__(self):
+        return bool(self.d)
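
Note on the intent of the change, with a small usage sketch (not part of the diff): iterating over a built-in set follows hash order, which can differ from run to run, whereas the OrderedSet added to lark/utils.py always yields elements in insertion order, so the Earley sets and scan buffers above are traversed deterministically. A minimal demonstration, assuming the patch is applied so that lark.utils.OrderedSet is importable; the string elements here are stand-ins for the parser's Item objects:

    from lark.utils import OrderedSet

    # Duplicates collapse (dict.fromkeys keeps the first occurrence)...
    s = OrderedSet(["expr", "term", "expr", "factor"])
    assert list(s) == ["expr", "term", "factor"]

    # ...and add/remove/contains mirror the built-in set API.
    s.add("atom")
    s.remove("term")
    assert "atom" in s and "term" not in s
    assert list(s) == ["expr", "factor", "atom"]   # insertion order preserved
    assert bool(s)                                 # truthiness follows the underlying dict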