Selaa lähdekoodia

Merge branch 'master' of https://github.com/erezsh/lark

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Kevin Latimer 6 vuotta sitten
vanhempi
commit
9426010b70
11 muutettua tiedostoa, joissa 96 lisäystä ja 62 poistoa
  1. +1
    -0
      lark/__init__.py
  2. +1
    -0
      lark/lark.py
  3. +8
    -0
      lark/lexer.py
  4. +3
    -3
      lark/load_grammar.py
  5. +1
    -0
      lark/parser_frontends.py
  6. +6
    -4
      lark/parsers/earley.py
  7. +7
    -3
      lark/parsers/lalr_parser.py
  8. +35
    -42
      lark/parsers/resolve_ambig.py
  9. +6
    -4
      lark/parsers/xearley.py
  10. +15
    -6
      lark/tree.py
  11. +13
    -0
      tests/test_parser.py

+ 1
- 0
lark/__init__.py Näytä tiedosto

@@ -1,5 +1,6 @@
from .tree import Tree, Transformer, InlineTransformer
from .common import ParseError, GrammarError
from .lexer import UnexpectedInput, LexError
from .lark import Lark
from .utils import inline_args



+ 1
- 0
lark/lark.py Näytä tiedosto

@@ -57,6 +57,7 @@ class LarkOptions(object):
self.profile = o.pop('profile', False)
self.ambiguity = o.pop('ambiguity', 'auto')
self.propagate_positions = o.pop('propagate_positions', False)
self.earley__predict_all = o.pop('earley__predict_all', False)

assert self.parser in ('earley', 'lalr', None)



+ 8
- 0
lark/lexer.py Näytä tiedosto

@@ -40,6 +40,14 @@ class Token(Str):
def __deepcopy__(self, memo):
return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

def __eq__(self, other):
if isinstance(other, Token) and self.type != other.type:
return False
return Str.__eq__(self, other)

__hash__ = Str.__hash__

class Regex:
def __init__(self, pattern, flags=()):
self.pattern = pattern


+ 3
- 3
lark/load_grammar.py Näytä tiedosto

@@ -293,7 +293,6 @@ def _rfind(s, choices):


def _fix_escaping(s):
s = s.replace('\\"', '"').replace("'", "\\'")
w = ''
i = iter(s)
for n in i:
@@ -305,6 +304,7 @@ def _fix_escaping(s):
elif n2 not in 'unftr':
w += '\\'
w += n2
w = w.replace('\\"', '"').replace("'", "\\'")

to_eval = "u'''%s'''" % w
try:
@@ -435,9 +435,9 @@ class Grammar:

for name, (tree, priority) in term_defs: # TODO transfer priority to rule?
if name.startswith('_'):
options = RuleOptions(filter_out=True, priority=priority)
options = RuleOptions(filter_out=True, priority=-priority)
else:
options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority)
options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority)

name = new_terminal_names[name]
inner_name = name + '_inner'


+ 1
- 0
lark/parser_frontends.py Näytä tiedosto

@@ -126,6 +126,7 @@ class XEarley:
parser_conf.callback,
resolve_ambiguity=get_ambiguity_resolver(options),
ignore=ignore,
predict_all=options.earley__predict_all
)

def _prepare_expansion(self, expansion):


+ 6
- 4
lark/parsers/earley.py Näytä tiedosto

@@ -90,7 +90,7 @@ class NewsList(list):

class Column:
"An entry in the table, aka Earley Chart. Contains lists of items."
def __init__(self, i, FIRST):
def __init__(self, i, FIRST, predict_all=False):
self.i = i
self.to_reduce = NewsList()
self.to_predict = NewsList()
@@ -100,6 +100,7 @@ class Column:

self.predicted = set()
self.completed = {}
self.predict_all = predict_all

def add(self, items):
"""Sort items into scan/predict/reduce newslists
@@ -108,9 +109,9 @@ class Column:
"""
for item in items:

item_key = item, item.tree # Elsewhere, tree is not part of the comparison
if item.is_complete:
# XXX Potential bug: What happens if there's ambiguity in an empty rule?
item_key = item, item.tree # Elsewhere, tree is not part of the comparison
if item.rule.expansion and item_key in self.completed:
old_tree = self.completed[item_key].tree
if old_tree == item.tree:
@@ -137,9 +138,10 @@ class Column:
if isinstance(item.expect, Terminal):
self.to_scan.append(item)
else:
if item in self.predicted:
k = item_key if self.predict_all else item
if k in self.predicted:
continue
self.predicted.add(item)
self.predicted.add(k)
self.to_predict.append(item)

self.item_count += 1 # Only count if actually added


+ 7
- 3
lark/parsers/lalr_parser.py Näytä tiedosto

@@ -7,6 +7,10 @@ from ..common import ParseError, UnexpectedToken

from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT

class FinalReduce:
def __init__(self, value):
self.value = value

class Parser:
def __init__(self, parser_conf):
assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -56,7 +60,7 @@ class _Parser:
res = self.callbacks[rule](s)

if end and len(state_stack) == 1 and rule.origin == self.start_symbol:
return res
return FinalReduce(res)

_action, new_state = get_action(rule.origin)
assert _action == ACTION_SHIFT
@@ -85,9 +89,9 @@ class _Parser:
_action, rule = get_action('$end')
assert _action == 'reduce'
res = reduce(*rule, end=True)
if res:
if isinstance(res, FinalReduce):
assert state_stack == [self.init_state] and not value_stack, len(state_stack)
return res
return res.value




+ 35
- 42
lark/parsers/resolve_ambig.py Näytä tiedosto

@@ -9,56 +9,60 @@ from ..tree import Tree, Visitor_NoRecurse
# Author: Erez Shinan

def _compare_rules(rule1, rule2):
if rule1.origin != rule2.origin:
if rule1.options and rule2.options:
if rule1.options.priority is not None and rule2.options.priority is not None:
assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2)
return -compare(rule1.options.priority, rule2.options.priority)

return 0

c = compare( len(rule1.expansion), len(rule2.expansion))
if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here
c = -compare( len(rule1.expansion), len(rule2.expansion))
if rule1.origin.startswith('__'): # XXX hack! We should set priority in parser, not here
c = -c
return c

def _compare_drv(tree1, tree2):
if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):

def _sum_priority(tree):
p = 0

for n in tree.iter_subtrees():
try:
return -compare(tree1, tree2)
except TypeError:
return 0
p += n.rule.options.priority or 0
except AttributeError:
pass

return p

def _compare_priority(tree1, tree2):
tree1.iter_subtrees()

def _compare_drv(tree1, tree2):
try:
rule1, rule2 = tree1.rule, tree2.rule
except AttributeError:
# Probably trees that don't take part in this parse (better way to distinguish?)
return -compare(tree1, tree2)
# Probably non-trees, or user trees that weren't created by the parse (better way to distinguish?)
return compare(tree1, tree2)

assert tree1.data != '_ambig'
assert tree2.data != '_ambig'

# XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
# when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
# computationally inefficient. So we handle it here.
if tree1.data == '_ambig':
_standard_resolve_ambig(tree1)
if tree2.data == '_ambig':
_standard_resolve_ambig(tree2)
p1 = _sum_priority(tree1)
p2 = _sum_priority(tree2)
c = (p1 or p2) and compare(p1, p2)
if c:
return c

c = _compare_rules(tree1.rule, tree2.rule)
if c:
return c

# rules are "equal", so compare trees
for t1, t2 in zip(tree1.children, tree2.children):
c = _compare_drv(t1, t2)
if c:
return c
if len(tree1.children) == len(tree2.children):
for t1, t2 in zip(tree1.children, tree2.children):
c = _compare_drv(t1, t2)
if c:
return c

return compare(len(tree1.children), len(tree2.children))


def _standard_resolve_ambig(tree):
assert tree.data == '_ambig'
best = min(tree.children, key=cmp_to_key(_compare_drv))
key_f = cmp_to_key(_compare_drv)
best = max(tree.children, key=key_f)
assert best.data == 'drv'
tree.set('drv', best.children)
tree.rule = best.rule # needed for applying callbacks
@@ -80,23 +84,12 @@ def _antiscore_sum_drv(tree):
if not isinstance(tree, Tree):
return 0

# XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
# when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
# computationally inefficient. So we handle it here.
if tree.data == '_ambig':
_antiscore_sum_resolve_ambig(tree)
assert tree.data != '_ambig'

try:
priority = tree.rule.options.priority
except AttributeError:
# Probably trees that don't take part in this parse (better way to distinguish?)
priority = None

return (priority or 0) + sum(map(_antiscore_sum_drv, tree.children), 0)
return _sum_priority(tree)

def _antiscore_sum_resolve_ambig(tree):
assert tree.data == '_ambig'

best = min(tree.children, key=_antiscore_sum_drv)
assert best.data == 'drv'
tree.set('drv', best.children)


+ 6
- 4
lark/parsers/xearley.py Näytä tiedosto

@@ -28,11 +28,12 @@ from .grammar_analysis import GrammarAnalyzer
from .earley import ApplyCallbacks, Item, Column

class Parser:
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()):
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False):
self.analysis = GrammarAnalyzer(rules, start_symbol)
self.start_symbol = start_symbol
self.resolve_ambiguity = resolve_ambiguity
self.ignore = list(ignore)
self.predict_all = predict_all


self.postprocess = {}
@@ -107,9 +108,10 @@ class Parser:
for j in range(1, len(s)):
m = item.expect.match(s[:-j])
if m:
delayed_matches[m.end()].append(item.advance(m.group(0)))
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append(item.advance(t))

next_set = Column(i+1, self.FIRST)
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
next_set.add(delayed_matches[i+1])
del delayed_matches[i+1] # No longer needed, so unburden memory

@@ -119,7 +121,7 @@ class Parser:
return next_set

# Main loop starts
column0 = Column(0, self.FIRST)
column0 = Column(0, self.FIRST, predict_all=self.predict_all)
column0.add(predict(start_symbol, column0))

column = column0


+ 15
- 6
lark/tree.py Näytä tiedosto

@@ -67,17 +67,26 @@ class Tree(object):
yield c

def iter_subtrees(self):
# TODO: Re-write as a more efficient version

visited = set()
q = [self]

l = []
while q:
subtree = q.pop()
l.append( subtree )
if id(subtree) in visited:
continue # already been here from another branch
visited.add(id(subtree))
yield subtree
q += [c for c in subtree.children if isinstance(c, Tree)]

seen = set()
for x in reversed(l):
if id(x) not in seen:
yield x
seen.add(id(x))


def __deepcopy__(self, memo):
return type(self)(self.data, deepcopy(self.children, memo))
@@ -100,7 +109,7 @@ class Transformer(object):
if isinstance(c, Tree):
try:
items.append(self.transform(c))
except Erase:
except Discard:
pass
try:
f = self._get_func(tree.data)
@@ -116,7 +125,7 @@ class Transformer(object):
return TransformerChain(self, other)


class Erase(Exception):
class Discard(Exception):
pass

class TransformerChain(object):
@@ -156,7 +165,7 @@ class Visitor_NoRecurse(Visitor):
def visit(self, tree):
subtrees = list(tree.iter_subtrees())

for subtree in reversed(subtrees):
for subtree in (subtrees):
getattr(self, subtree.data, self.__default__)(subtree)
return tree

@@ -174,13 +183,13 @@ class Transformer_NoRecurse(Transformer):
else:
return f(t)

for subtree in reversed(subtrees):
for subtree in (subtrees):
children = []
for c in subtree.children:
if isinstance(c, Tree):
try:
children.append(_t(c))
except Erase:
except Discard:
pass
else:
children.append(c)


+ 13
- 0
tests/test_parser.py Näytä tiedosto

@@ -711,6 +711,19 @@ def _make_parser_test(LEXER, PARSER):
""")
x = g.parse('AB')

@unittest.skipIf(LEXER == None, "Scanless can't handle regexps")
def test_regex_quote(self):
g = r"""
start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
SINGLE_QUOTED_STRING : /'[^']*'/
DOUBLE_QUOTED_STRING : /"[^"]*"/
"""

g = _Lark(g)
self.assertEqual( g.parse('"hello"').children, ['"hello"'])
self.assertEqual( g.parse("'hello'").children, ["'hello'"])


def test_lexer_token_limit(self):
"Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
tokens = {'A%d'%i:'"%d"'%i for i in range(300)}


Ladataan…
Peruuta
Tallenna