Browse Source

Added token priority for standard lexers.

Big Thanks to Seth G of the mappyfile project for financing this feature!
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 7 years ago
parent
commit
ac9ea26b58
4 changed files with 68 additions and 17 deletions:
  1. +2  -1   lark/common.py
  2. +1  -1   lark/lexer.py
  3. +17 -13  lark/load_grammar.py
  4. +48 -2   tests/test_parser.py
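
For context, a minimal sketch of the grammar-level syntax this commit enables, modeled on the
test_lexer_prioritization test added below (assumes a lark build that includes this change):

    from lark import Lark

    # "AB.3" gives the AB terminal priority 3, so the standard lexer now tries it
    # before the default-priority A and B terminals when both could match.
    grammar = """
    start: A B | AB
    A: "a"
    B: "b"
    AB.3: "ab"
    """

    parser = Lark(grammar, parser='earley', lexer='standard')
    tree = parser.parse("ab")
    assert tree.children == ['ab']   # AB wins over A+B, as the new test asserts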

+2 -1  lark/common.py  View File

@@ -89,10 +89,11 @@ class PatternRE(Pattern):
         return sre_parse.parse(self.to_regexp()).getwidth()[1]
 
 class TokenDef(object):
-    def __init__(self, name, pattern):
+    def __init__(self, name, pattern, priority=1):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
         self.pattern = pattern
+        self.priority = priority
 
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
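
A rough standalone illustration of the widened constructor; it assumes PatternStr is the
literal-pattern class defined alongside TokenDef in lark/common.py:

    from lark.common import PatternStr, TokenDef

    # priority defaults to 1, so existing callers that pass only (name, pattern)
    # keep their previous behaviour.
    td = TokenDef('AB', PatternStr('ab'), priority=3)
    assert td.priority == 3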


+1 -1  lark/lexer.py  View File

@@ -131,7 +131,7 @@ class Lexer(object):
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = [t for t in ignore]
 
-        tokens.sort(key=lambda x:x.pattern.max_width, reverse=True)
+        tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, x.name))
 
         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
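
A standalone sketch (using a stand-in namedtuple, not lark's internal classes) of the ordering the
new sort key produces: higher priority first, then longer maximum match, then name as a stable tiebreak:

    from collections import namedtuple

    Tok = namedtuple('Tok', 'name priority max_width')
    tokens = [Tok('B', 1, 1), Tok('AB', 1, 2), Tok('A', 2, 1)]
    tokens.sort(key=lambda x: (-x.priority, -x.max_width, x.name))
    assert [t.name for t in tokens] == ['A', 'AB', 'B']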


+17 -13  lark/load_grammar.py  View File

@@ -110,7 +110,8 @@ RULES = {
     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOT _DOT STRING'],
 
-    'token': ['TOKEN _COLON expansions _NL'],
+    'token': ['TOKEN _COLON expansions _NL',
+              'TOKEN _DOT NUMBER _COLON expansions _NL'],
     'statement': ['ignore', 'import'],
     'ignore': ['_IGNORE expansions _NL'],
     'import': ['_IMPORT import_args _NL',
@@ -373,7 +374,7 @@ class Grammar:
         # Implement the "%ignore" feature without a lexer..
         terms_to_ignore = {name:'__'+name for name in self.ignore}
         if terms_to_ignore:
-            assert set(terms_to_ignore) <= {name for name, t in term_defs}
+            assert set(terms_to_ignore) <= {name for name, _t in term_defs}
             term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
             expr = Token('RULE', '__ignore')
             for r, tree, _o in rule_defs:
@@ -388,7 +389,7 @@ class Grammar:
             rule_defs.append(('__ignore', _ignore_tree, None))
 
         # Convert all tokens to rules
-        new_terminal_names = {name: '__token_'+name for name, tree in term_defs}
+        new_terminal_names = {name: '__token_'+name for name, _t in term_defs}
 
         for name, tree, options in rule_defs:
             for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
@@ -396,11 +397,11 @@ class Grammar:
                     if sym in new_terminal_names:
                         exp.children[i] = Token(sym.type, new_terminal_names[sym])
 
-        for name, tree in term_defs:
+        for name, (tree, priority) in term_defs:    # TODO transfer priority to rule?
             if name.startswith('_'):
-                options = RuleOptions(filter_out=True)
+                options = RuleOptions(filter_out=True, priority=priority)
             else:
-                options = RuleOptions(keep_all_tokens=True, create_token=name)
+                options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority)
 
             name = new_terminal_names[name]
             inner_name = name + '_inner'
@@ -423,8 +424,8 @@ class Grammar:

         # Convert token-trees to strings/regexps
         transformer = PrepareLiterals() * TokenTreeToPattern()
-        tokens = [TokenDef(name, transformer.transform(token_tree))
-                  for name, token_tree in token_defs]
+        tokens = [TokenDef(name, transformer.transform(token_tree), priority)
+                  for name, (token_tree, priority) in token_defs]
 
         # =================
         # Compile Rules
@@ -504,7 +505,7 @@ def resolve_token_references(token_defs):

     while True:
         changed = False
-        for name, token_tree in token_defs:
+        for name, (token_tree, _p) in token_defs:
             for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
                 for i, item in enumerate(exp.children):
                     if isinstance(item, Token):
@@ -555,7 +556,9 @@ class GrammarLoader:
         statements = [c.children for c in tree.children if c.data=='statement']
         assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)
 
-        token_defs = [(name.value, t) for name, t in token_defs]
+        token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs]
+
+        token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs]
 
         # Execute statements
         ignore = []
@@ -568,8 +571,9 @@ class GrammarLoader:
                 name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
                 grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
                 g = import_grammar(grammar_path)
-                token_tree = dict(g.token_defs)[dotted_path[-1]]
-                token_defs.append([name.value, token_tree])
+                token_options = dict(g.token_defs)[dotted_path[-1]]
+                assert isinstance(token_options, tuple) and len(token_options)==2
+                token_defs.append([name.value, token_options])
             else:
                 assert False, stmt

@@ -594,7 +598,7 @@ class GrammarLoader:

             name = '__IGNORE_%d'% len(ignore_names)
             ignore_names.append(name)
-            token_defs.append((name, t))
+            token_defs.append((name, (t, 0)))


         # Verify correctness 2
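
A simplified sketch of the token_defs normalization in the GrammarLoader hunk above, with plain
strings standing in for the real Token names and parse trees: definitions without an explicit ".N"
priority get the default of 1, and everything is repacked as (name, (tree, priority)):

    token_defs = [('A', 'tree_a'),          # A: ...     (no priority given)
                  ('AB', 3, 'tree_ab')]     # AB.3: ...  (priority 3)
    token_defs = [td if len(td) == 3 else (td[0], 1, td[1]) for td in token_defs]
    token_defs = [(name, (t, int(p))) for name, p, t in token_defs]
    assert token_defs == [('A', ('tree_a', 1)), ('AB', ('tree_ab', 3))]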


+48 -2  tests/test_parser.py  View File

@@ -635,7 +635,8 @@ def _make_parser_test(LEXER, PARSER):
             b.2: "a"
             """
 
-            l = Lark(grammar, parser='earley', lexer='standard')
+            # l = Lark(grammar, parser='earley', lexer='standard')
+            l = _Lark(grammar)
             res = l.parse("a")
             self.assertEqual(res.children[0].data, 'b')

@@ -645,10 +646,55 @@ def _make_parser_test(LEXER, PARSER):
             b.1: "a"
             """
 
-            l = Lark(grammar, parser='earley', lexer='standard')
+            l = _Lark(grammar)
+            # l = Lark(grammar, parser='earley', lexer='standard')
             res = l.parse("a")
             self.assertEqual(res.children[0].data, 'a')
 
+        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
+        def test_lexer_prioritization(self):
+            "Tests effect of priority on result"
+
+            grammar = """
+            start: A B | AB
+            A.2: "a"
+            B: "b"
+            AB: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+
+            self.assertEqual(res.children, ['a', 'b'])
+            self.assertNotEqual(res.children, ['ab'])
+
+            grammar = """
+            start: A B | AB
+            A: "a"
+            B: "b"
+            AB.3: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+
+            self.assertNotEqual(res.children, ['a', 'b'])
+            self.assertEqual(res.children, ['ab'])
+
+
         @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports ambiguity")
         def test_ambiguity1(self):
             grammar = """
             start: cd+ "e"
 
             !cd: "c"
                | "d"
                | "cd"
 
             """
             l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=None)
             x = l.parse('cde')
             assert x.data == '_ambig'
             assert len(x.children) == 2
 
 
         @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
         def test_earley_prioritization_sum(self):

