Commit 40228c6
Earley now uses OrderedSet for better output stability

erezsh committed Aug 23, 2023
1 parent 656334c · commit 40228c6
Showing 4 changed files with 41 additions and 17 deletions.
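
Context for the change (illustration, not part of the commit): Python's built-in set makes no guarantee about iteration order — for strings it depends on per-process hash randomization, so the same parse can emit ambiguous results in a different order from one interpreter run to the next. A dict, by contrast, preserves insertion order (guaranteed since Python 3.7), which is the property the new OrderedSet relies on. A minimal sketch of the difference:

    # Illustration only -- not from the commit. With hash randomization on
    # (Python's default), the iteration order of a set of strings can differ
    # between interpreter runs; dict.fromkeys always preserves insertion order.
    items = ["rule_a", "rule_b", "rule_c", "rule_d"]

    unordered = set(items)
    ordered = dict.fromkeys(items)   # the trick the new OrderedSet relies on

    print("set order: ", list(unordered))   # may change from run to run
    print("dict order:", list(ordered))     # always: rule_a, rule_b, rule_c, rule_d
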
12 changes: 6 additions & 6 deletions lark/parsers/earley.py
@@ -16,7 +16,7 @@
 from ..lexer import Token
 from ..tree import Tree
 from ..exceptions import UnexpectedEOF, UnexpectedToken
-from ..utils import logger
+from ..utils import logger, OrderedSet
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal
 from .earley_common import Item
@@ -190,13 +190,13 @@ def scan(i, token, to_scan):
             Earley predictor, based on the previously completed tokens.
             This ensures that at each phase of the parse we have a custom
             lexer context, allowing for more complex ambiguities."""
-            next_to_scan = set()
-            next_set = set()
+            next_to_scan = OrderedSet()
+            next_set = OrderedSet()
             columns.append(next_set)
             transitives.append({})
             node_cache = {}

-            for item in set(to_scan):
+            for item in OrderedSet(to_scan):
                 if match(item.expect, token):
                     new_item = item.advance()
                     label = (new_item.s, new_item.start, i)
@@ -260,8 +260,8 @@ def parse(self, lexer, start):
         assert start, start
         start_symbol = NonTerminal(start)

-        columns = [set()]
-        to_scan = set()     # The scan buffer. 'Q' in E.Scott's paper.
+        columns = [OrderedSet()]
+        to_scan = OrderedSet()     # The scan buffer. 'Q' in E.Scott's paper.

         ## Predict for the start_symbol.
         # Add predicted items to the first Earley set (for the predictor) if they
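
A side note on the scan loop (illustration, not from the commit): `set(to_scan)` in the old code served two purposes — it iterated the scan buffer and it took a snapshot so the loop body could mutate `to_scan` safely. `OrderedSet(to_scan)` keeps the snapshot behaviour (its constructor accepts any iterable) while making the iteration order stable. A small sketch of the same pattern outside the parser:

    from lark.utils import OrderedSet   # added by this commit

    to_scan = OrderedSet(["item_a", "item_b"])

    # Iterating a copy lets the loop body mutate the original buffer safely,
    # exactly like the old `for item in set(to_scan):` -- but in stable order.
    for item in OrderedSet(to_scan):
        if item == "item_b":
            to_scan.add("item_c")    # does not disturb the ongoing iteration

    print(list(to_scan))             # ['item_a', 'item_b', 'item_c']
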
10 changes: 5 additions & 5 deletions lark/parsers/earley_forest.py
@@ -15,7 +15,7 @@

 from ..parse_tree_builder import AmbiguousIntermediateExpander
 from ..visitors import Discard
-from ..utils import logger
+from ..utils import logger, OrderedSet
 from ..tree import Tree

 class ForestNode:
@@ -49,8 +49,8 @@ def __init__(self, s, start, end):
         self.s = s
         self.start = start
         self.end = end
-        self._children = set()
-        self.paths = set()
+        self._children = OrderedSet()
+        self.paths = OrderedSet()
         self.paths_loaded = False

         ### We use inf here as it can be safely negated without resorting to conditionals,
@@ -280,10 +280,10 @@ def visit(self, root):
         # of a symbol/intermediate so that we can process both up and down. Also,
         # since the SPPF can have cycles it allows us to detect if we're trying
         # to recurse into a node that's already on the stack (infinite recursion).
-        visiting = set()
+        visiting = OrderedSet()

         # set of all nodes that have been visited
-        visited = set()
+        visited = OrderedSet()

         # a list of nodes that are currently being visited
         # used for the `on_cycle` callback
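
The `visiting` set here implements the standard on-stack check for walking a graph that may contain cycles; switching it to OrderedSet changes only the order in which nodes are explored, not the logic. A minimal, self-contained sketch of the pattern (hypothetical node graph, not the real SPPF API):

    def visit(node, children, visiting, visited, on_cycle):
        # `visiting` = nodes currently on the recursion stack; `visited` = done.
        if node in visiting:
            on_cycle(node)           # recursing into a node already on the stack
            return
        if node in visited:
            return
        visiting.add(node)
        for child in children.get(node, ()):
            visit(child, children, visiting, visited, on_cycle)
        visiting.remove(node)
        visited.add(node)

    graph = {"a": ["b"], "b": ["a"]}          # a tiny cyclic graph: a -> b -> a
    visit("a", graph, set(), set(), lambda n: print("cycle at", n))
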
10 changes: 5 additions & 5 deletions lark/parsers/xearley.py
@@ -1,4 +1,4 @@
"""This module implements an experimental Earley parser with a dynamic lexer
"""This module implements an Earley parser with a dynamic lexer
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497
@@ -22,7 +22,7 @@
 from ..grammar import Terminal
 from .earley import Parser as BaseParser
 from .earley_forest import SymbolNode, TokenNode
-
+from ..utils import OrderedSet

 class Parser(BaseParser):
     def __init__(self, lexer_conf, parser_conf, term_matcher, resolve_ambiguity=True, complete_lex = False, debug=False, tree_class=Tree):
@@ -49,7 +49,7 @@ def scan(i, to_scan):
             # they complete, we push all tokens into a buffer (delayed_matches), to
             # be held possibly for a later parse step when we reach the point in the
             # input stream at which they complete.
-            for item in set(to_scan):
+            for item in OrderedSet(to_scan):
                 m = match(item.expect, stream, i)
                 if m:
                     t = Token(item.expect.name, m.group(0), i, text_line, text_column)
@@ -81,8 +81,8 @@ def scan(i, to_scan):
                     # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed.
                     delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

-            next_to_scan = set()
-            next_set = set()
+            next_to_scan = OrderedSet()
+            next_set = OrderedSet()
             columns.append(next_set)
             transitives.append({})

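
The `delayed_matches` buffer mentioned in the comments above is keyed by the input position where a (possibly multi-character) match ends — see `delayed_matches[m.end()]` in the hunk — so a token found at position i can wait until the main loop reaches its end position. A rough sketch of the idea under assumptions (simplified token shape and a hypothetical `remember` helper, not xearley's real data types):

    from collections import defaultdict

    # position in the input -> list of matches that complete there
    delayed_matches = defaultdict(list)

    def remember(item, start, text, match_end):
        # Hold the match until the main scan loop reaches `match_end`.
        delayed_matches[match_end].append((item, start, text))

    remember("ITEM_A", 0, "foo", 3)
    remember("ITEM_B", 1, "oobar", 6)

    for pos in range(7):            # stand-in for the parser's main scan loop
        for item, start, text in delayed_matches.pop(pos, []):
            print(f"at {pos}: scan {item!r} matched {text!r} from {start}")
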
26 changes: 25 additions & 1 deletion lark/utils.py
@@ -2,7 +2,7 @@
 import os
 from itertools import product
 from collections import deque
-from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable
+from typing import Callable, Iterator, List, Optional, Tuple, Type, TypeVar, Union, Dict, Any, Sequence, Iterable, Generic

 ###{standalone
 import sys, re
@@ -328,3 +328,27 @@ def small_factors(n: int, max_factor: int) -> List[Tuple[int, int]]:
         if a + b <= max_factor:
             return small_factors(r, max_factor) + [(a, b)]
     assert False, "Failed to factorize %s" % n
+
+
+class OrderedSet(Generic[T]):
+    """A minimal OrderedSet implementation, using a dictionary.
+
+    (relies on the dictionary being ordered)
+    """
+    def __init__(self, items: Iterable[T] =()):
+        self.d = dict.fromkeys(items)
+
+    def __contains__(self, item: T) -> bool:
+        return item in self.d
+
+    def add(self, item: T):
+        self.d[item] = None
+
+    def __iter__(self) -> Iterator[T]:
+        return iter(self.d)
+
+    def remove(self, item: T):
+        del self.d[item]
+
+    def __bool__(self):
+        return bool(self.d)
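
A quick usage sketch of the new class (illustration, not from the commit): it supports the small subset of the set API the parsers need — membership, add, remove, iteration, and truthiness — and iterates in insertion order because the backing dict preserves it.

    from lark.utils import OrderedSet

    s = OrderedSet(["b", "a"])
    s.add("c")
    s.add("a")            # re-adding an existing item keeps its original position

    assert "a" in s                      # __contains__
    assert list(s) == ["b", "a", "c"]    # __iter__: insertion order, no duplicates

    s.remove("b")         # deletes from the backing dict; KeyError if absent
    assert bool(s)        # truthy while non-empty
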