Browse Source

Merge branch 'master' of https://github.com/erezsh/lark

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Kevin Latimer 6 years ago
parent
commit
9426010b70
11 changed files with 96 additions and 62 deletions
  1. +1
    -0
      lark/__init__.py
  2. +1
    -0
      lark/lark.py
  3. +8
    -0
      lark/lexer.py
  4. +3
    -3
      lark/load_grammar.py
  5. +1
    -0
      lark/parser_frontends.py
  6. +6
    -4
      lark/parsers/earley.py
  7. +7
    -3
      lark/parsers/lalr_parser.py
  8. +35
    -42
      lark/parsers/resolve_ambig.py
  9. +6
    -4
      lark/parsers/xearley.py
  10. +15
    -6
      lark/tree.py
  11. +13
    -0
      tests/test_parser.py

+ 1
- 0
lark/__init__.py View File

@@ -1,5 +1,6 @@
from .tree import Tree, Transformer, InlineTransformer from .tree import Tree, Transformer, InlineTransformer
from .common import ParseError, GrammarError from .common import ParseError, GrammarError
from .lexer import UnexpectedInput, LexError
from .lark import Lark from .lark import Lark
from .utils import inline_args from .utils import inline_args




+ 1
- 0
lark/lark.py View File

@@ -57,6 +57,7 @@ class LarkOptions(object):
self.profile = o.pop('profile', False) self.profile = o.pop('profile', False)
self.ambiguity = o.pop('ambiguity', 'auto') self.ambiguity = o.pop('ambiguity', 'auto')
self.propagate_positions = o.pop('propagate_positions', False) self.propagate_positions = o.pop('propagate_positions', False)
self.earley__predict_all = o.pop('earley__predict_all', False)


assert self.parser in ('earley', 'lalr', None) assert self.parser in ('earley', 'lalr', None)




+ 8
- 0
lark/lexer.py View File

@@ -40,6 +40,14 @@ class Token(Str):
def __deepcopy__(self, memo): def __deepcopy__(self, memo):
return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)


def __eq__(self, other):
if isinstance(other, Token) and self.type != other.type:
return False
return Str.__eq__(self, other)

__hash__ = Str.__hash__

class Regex: class Regex:
def __init__(self, pattern, flags=()): def __init__(self, pattern, flags=()):
self.pattern = pattern self.pattern = pattern


+ 3
- 3
lark/load_grammar.py View File

@@ -293,7 +293,6 @@ def _rfind(s, choices):




def _fix_escaping(s): def _fix_escaping(s):
s = s.replace('\\"', '"').replace("'", "\\'")
w = '' w = ''
i = iter(s) i = iter(s)
for n in i: for n in i:
@@ -305,6 +304,7 @@ def _fix_escaping(s):
elif n2 not in 'unftr': elif n2 not in 'unftr':
w += '\\' w += '\\'
w += n2 w += n2
w = w.replace('\\"', '"').replace("'", "\\'")


to_eval = "u'''%s'''" % w to_eval = "u'''%s'''" % w
try: try:
@@ -435,9 +435,9 @@ class Grammar:


for name, (tree, priority) in term_defs: # TODO transfer priority to rule? for name, (tree, priority) in term_defs: # TODO transfer priority to rule?
if name.startswith('_'): if name.startswith('_'):
options = RuleOptions(filter_out=True, priority=priority)
options = RuleOptions(filter_out=True, priority=-priority)
else: else:
options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority)
options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority)


name = new_terminal_names[name] name = new_terminal_names[name]
inner_name = name + '_inner' inner_name = name + '_inner'


+ 1
- 0
lark/parser_frontends.py View File

@@ -126,6 +126,7 @@ class XEarley:
parser_conf.callback, parser_conf.callback,
resolve_ambiguity=get_ambiguity_resolver(options), resolve_ambiguity=get_ambiguity_resolver(options),
ignore=ignore, ignore=ignore,
predict_all=options.earley__predict_all
) )


def _prepare_expansion(self, expansion): def _prepare_expansion(self, expansion):


+ 6
- 4
lark/parsers/earley.py View File

@@ -90,7 +90,7 @@ class NewsList(list):


class Column: class Column:
"An entry in the table, aka Earley Chart. Contains lists of items." "An entry in the table, aka Earley Chart. Contains lists of items."
def __init__(self, i, FIRST):
def __init__(self, i, FIRST, predict_all=False):
self.i = i self.i = i
self.to_reduce = NewsList() self.to_reduce = NewsList()
self.to_predict = NewsList() self.to_predict = NewsList()
@@ -100,6 +100,7 @@ class Column:


self.predicted = set() self.predicted = set()
self.completed = {} self.completed = {}
self.predict_all = predict_all


def add(self, items): def add(self, items):
"""Sort items into scan/predict/reduce newslists """Sort items into scan/predict/reduce newslists
@@ -108,9 +109,9 @@ class Column:
""" """
for item in items: for item in items:


item_key = item, item.tree # Elsewhere, tree is not part of the comparison
if item.is_complete: if item.is_complete:
# XXX Potential bug: What happens if there's ambiguity in an empty rule? # XXX Potential bug: What happens if there's ambiguity in an empty rule?
item_key = item, item.tree # Elsewhere, tree is not part of the comparison
if item.rule.expansion and item_key in self.completed: if item.rule.expansion and item_key in self.completed:
old_tree = self.completed[item_key].tree old_tree = self.completed[item_key].tree
if old_tree == item.tree: if old_tree == item.tree:
@@ -137,9 +138,10 @@ class Column:
if isinstance(item.expect, Terminal): if isinstance(item.expect, Terminal):
self.to_scan.append(item) self.to_scan.append(item)
else: else:
if item in self.predicted:
k = item_key if self.predict_all else item
if k in self.predicted:
continue continue
self.predicted.add(item)
self.predicted.add(k)
self.to_predict.append(item) self.to_predict.append(item)


self.item_count += 1 # Only count if actually added self.item_count += 1 # Only count if actually added


+ 7
- 3
lark/parsers/lalr_parser.py View File

@@ -7,6 +7,10 @@ from ..common import ParseError, UnexpectedToken


from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT


class FinalReduce:
def __init__(self, value):
self.value = value

class Parser: class Parser:
def __init__(self, parser_conf): def __init__(self, parser_conf):
assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -56,7 +60,7 @@ class _Parser:
res = self.callbacks[rule](s) res = self.callbacks[rule](s)


if end and len(state_stack) == 1 and rule.origin == self.start_symbol: if end and len(state_stack) == 1 and rule.origin == self.start_symbol:
return res
return FinalReduce(res)


_action, new_state = get_action(rule.origin) _action, new_state = get_action(rule.origin)
assert _action == ACTION_SHIFT assert _action == ACTION_SHIFT
@@ -85,9 +89,9 @@ class _Parser:
_action, rule = get_action('$end') _action, rule = get_action('$end')
assert _action == 'reduce' assert _action == 'reduce'
res = reduce(*rule, end=True) res = reduce(*rule, end=True)
if res:
if isinstance(res, FinalReduce):
assert state_stack == [self.init_state] and not value_stack, len(state_stack) assert state_stack == [self.init_state] and not value_stack, len(state_stack)
return res
return res.value







+ 35
- 42
lark/parsers/resolve_ambig.py View File

@@ -9,56 +9,60 @@ from ..tree import Tree, Visitor_NoRecurse
# Author: Erez Sh # Author: Erez Sh


def _compare_rules(rule1, rule2): def _compare_rules(rule1, rule2):
if rule1.origin != rule2.origin:
if rule1.options and rule2.options:
if rule1.options.priority is not None and rule2.options.priority is not None:
assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2)
return -compare(rule1.options.priority, rule2.options.priority)

return 0

c = compare( len(rule1.expansion), len(rule2.expansion))
if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here
c = -compare( len(rule1.expansion), len(rule2.expansion))
if rule1.origin.startswith('__'): # XXX hack! We should set priority in parser, not here
c = -c c = -c
return c return c


def _compare_drv(tree1, tree2):
if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):

def _sum_priority(tree):
p = 0

for n in tree.iter_subtrees():
try: try:
return -compare(tree1, tree2)
except TypeError:
return 0
p += n.rule.options.priority or 0
except AttributeError:
pass

return p

def _compare_priority(tree1, tree2):
tree1.iter_subtrees()


def _compare_drv(tree1, tree2):
try: try:
rule1, rule2 = tree1.rule, tree2.rule rule1, rule2 = tree1.rule, tree2.rule
except AttributeError: except AttributeError:
# Probably trees that don't take part in this parse (better way to distinguish?)
return -compare(tree1, tree2)
# Probably non-trees, or user trees that weren't created by the parse (better way to distinguish?)
return compare(tree1, tree2)

assert tree1.data != '_ambig'
assert tree2.data != '_ambig'


# XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
# when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
# computationally inefficient. So we handle it here.
if tree1.data == '_ambig':
_standard_resolve_ambig(tree1)
if tree2.data == '_ambig':
_standard_resolve_ambig(tree2)
p1 = _sum_priority(tree1)
p2 = _sum_priority(tree2)
c = (p1 or p2) and compare(p1, p2)
if c:
return c


c = _compare_rules(tree1.rule, tree2.rule) c = _compare_rules(tree1.rule, tree2.rule)
if c: if c:
return c return c


# rules are "equal", so compare trees # rules are "equal", so compare trees
for t1, t2 in zip(tree1.children, tree2.children):
c = _compare_drv(t1, t2)
if c:
return c
if len(tree1.children) == len(tree2.children):
for t1, t2 in zip(tree1.children, tree2.children):
c = _compare_drv(t1, t2)
if c:
return c


return compare(len(tree1.children), len(tree2.children)) return compare(len(tree1.children), len(tree2.children))




def _standard_resolve_ambig(tree): def _standard_resolve_ambig(tree):
assert tree.data == '_ambig' assert tree.data == '_ambig'
best = min(tree.children, key=cmp_to_key(_compare_drv))
key_f = cmp_to_key(_compare_drv)
best = max(tree.children, key=key_f)
assert best.data == 'drv' assert best.data == 'drv'
tree.set('drv', best.children) tree.set('drv', best.children)
tree.rule = best.rule # needed for applying callbacks tree.rule = best.rule # needed for applying callbacks
@@ -80,23 +84,12 @@ def _antiscore_sum_drv(tree):
if not isinstance(tree, Tree): if not isinstance(tree, Tree):
return 0 return 0


# XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
# when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
# computationally inefficient. So we handle it here.
if tree.data == '_ambig':
_antiscore_sum_resolve_ambig(tree)
assert tree.data != '_ambig'


try:
priority = tree.rule.options.priority
except AttributeError:
# Probably trees that don't take part in this parse (better way to distinguish?)
priority = None

return (priority or 0) + sum(map(_antiscore_sum_drv, tree.children), 0)
return _sum_priority(tree)


def _antiscore_sum_resolve_ambig(tree): def _antiscore_sum_resolve_ambig(tree):
assert tree.data == '_ambig' assert tree.data == '_ambig'

best = min(tree.children, key=_antiscore_sum_drv) best = min(tree.children, key=_antiscore_sum_drv)
assert best.data == 'drv' assert best.data == 'drv'
tree.set('drv', best.children) tree.set('drv', best.children)


+ 6
- 4
lark/parsers/xearley.py View File

@@ -28,11 +28,12 @@ from .grammar_analysis import GrammarAnalyzer
from .earley import ApplyCallbacks, Item, Column from .earley import ApplyCallbacks, Item, Column


class Parser: class Parser:
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()):
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False):
self.analysis = GrammarAnalyzer(rules, start_symbol) self.analysis = GrammarAnalyzer(rules, start_symbol)
self.start_symbol = start_symbol self.start_symbol = start_symbol
self.resolve_ambiguity = resolve_ambiguity self.resolve_ambiguity = resolve_ambiguity
self.ignore = list(ignore) self.ignore = list(ignore)
self.predict_all = predict_all




self.postprocess = {} self.postprocess = {}
@@ -107,9 +108,10 @@ class Parser:
for j in range(1, len(s)): for j in range(1, len(s)):
m = item.expect.match(s[:-j]) m = item.expect.match(s[:-j])
if m: if m:
delayed_matches[m.end()].append(item.advance(m.group(0)))
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append(item.advance(t))


next_set = Column(i+1, self.FIRST)
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
next_set.add(delayed_matches[i+1]) next_set.add(delayed_matches[i+1])
del delayed_matches[i+1] # No longer needed, so unburden memory del delayed_matches[i+1] # No longer needed, so unburden memory


@@ -119,7 +121,7 @@ class Parser:
return next_set return next_set


# Main loop starts # Main loop starts
column0 = Column(0, self.FIRST)
column0 = Column(0, self.FIRST, predict_all=self.predict_all)
column0.add(predict(start_symbol, column0)) column0.add(predict(start_symbol, column0))


column = column0 column = column0


+ 15
- 6
lark/tree.py View File

@@ -67,17 +67,26 @@ class Tree(object):
yield c yield c


def iter_subtrees(self): def iter_subtrees(self):
# TODO: Re-write as a more efficient version

visited = set() visited = set()
q = [self] q = [self]


l = []
while q: while q:
subtree = q.pop() subtree = q.pop()
l.append( subtree )
if id(subtree) in visited: if id(subtree) in visited:
continue # already been here from another branch continue # already been here from another branch
visited.add(id(subtree)) visited.add(id(subtree))
yield subtree
q += [c for c in subtree.children if isinstance(c, Tree)] q += [c for c in subtree.children if isinstance(c, Tree)]


seen = set()
for x in reversed(l):
if id(x) not in seen:
yield x
seen.add(id(x))



def __deepcopy__(self, memo): def __deepcopy__(self, memo):
return type(self)(self.data, deepcopy(self.children, memo)) return type(self)(self.data, deepcopy(self.children, memo))
@@ -100,7 +109,7 @@ class Transformer(object):
if isinstance(c, Tree): if isinstance(c, Tree):
try: try:
items.append(self.transform(c)) items.append(self.transform(c))
except Erase:
except Discard:
pass pass
try: try:
f = self._get_func(tree.data) f = self._get_func(tree.data)
@@ -116,7 +125,7 @@ class Transformer(object):
return TransformerChain(self, other) return TransformerChain(self, other)




class Erase(Exception):
class Discard(Exception):
pass pass


class TransformerChain(object): class TransformerChain(object):
@@ -156,7 +165,7 @@ class Visitor_NoRecurse(Visitor):
def visit(self, tree): def visit(self, tree):
subtrees = list(tree.iter_subtrees()) subtrees = list(tree.iter_subtrees())


for subtree in reversed(subtrees):
for subtree in (subtrees):
getattr(self, subtree.data, self.__default__)(subtree) getattr(self, subtree.data, self.__default__)(subtree)
return tree return tree


@@ -174,13 +183,13 @@ class Transformer_NoRecurse(Transformer):
else: else:
return f(t) return f(t)


for subtree in reversed(subtrees):
for subtree in (subtrees):
children = [] children = []
for c in subtree.children: for c in subtree.children:
if isinstance(c, Tree): if isinstance(c, Tree):
try: try:
children.append(_t(c)) children.append(_t(c))
except Erase:
except Discard:
pass pass
else: else:
children.append(c) children.append(c)


+ 13
- 0
tests/test_parser.py View File

@@ -711,6 +711,19 @@ def _make_parser_test(LEXER, PARSER):
""") """)
x = g.parse('AB') x = g.parse('AB')


@unittest.skipIf(LEXER == None, "Scanless can't handle regexps")
def test_regex_quote(self):
g = r"""
start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
SINGLE_QUOTED_STRING : /'[^']*'/
DOUBLE_QUOTED_STRING : /"[^"]*"/
"""

g = _Lark(g)
self.assertEqual( g.parse('"hello"').children, ['"hello"'])
self.assertEqual( g.parse("'hello'").children, ["'hello'"])


def test_lexer_token_limit(self): def test_lexer_token_limit(self):
"Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation" "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
tokens = {'A%d'%i:'"%d"'%i for i in range(300)} tokens = {'A%d'%i:'"%d"'%i for i in range(300)}


Loading…
Cancel
Save