It now knows how to resolve ambiguity! And in a memory-efficient way!
@@ -134,7 +134,7 @@ These features may be implemented some day:
- You can work with parse-trees instead of state-machines
- The grammar is simple to read and write
- There are no restrictions on grammar structure. Any grammar you write can be parsed.
- Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working.
- Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working
- A well-written grammar is very fast
- Note: Nondeterministic grammars will run a little slower
- Note: Ambiguous grammars (grammars that can be parsed in more than one way) are supported, but may cause significant slowdown if the ambiguity is too big (see the example below)
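For illustration, a minimal sketch of what this looks like in practice, assuming the scanless Earley frontend introduced by this change (the expected result mirrors `test_earley_scanless3` further down in the diff):

```python
from lark import Lark

# "aaa" can be split between the two A's in several ways; the parser builds
# all derivations and the disambiguation stage picks a single tree.
parser = Lark("""
    start: A A
    A: "a"+
""", parser='earley', lexer=None)

print(parser.parse("aaa").children)   # e.g. ['aa', 'a']
```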
@@ -25,7 +25,7 @@ parser = Lark(r"""
        %import common.WS_INLINE
        %ignore WS_INLINE
    """)
    """, lexer=None)

def test():
    sample_conf = """
@@ -29,7 +29,7 @@ class UnexpectedToken(ParseError):

def is_terminal(sym):
    return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$'
    return isinstance(sym, Terminal) or sym.isupper() or sym[0] == '$'

class LexerConf:
@@ -81,3 +81,26 @@ class TokenDef(object):
    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)

class Terminal:
    def __init__(self, data):
        self.data = data

    def __repr__(self):
        return '%r' % self.data

    def __eq__(self, other):
        return isinstance(other, type(self)) and self.data == other.data
    def __hash__(self):
        return hash(self.data)

class Terminal_Regexp(Terminal):
    def __init__(self, data):
        Terminal.__init__(self, data)
        self.match = re.compile(data).match

class Terminal_Token(Terminal):
    def match(self, other):
        return self.data == other.type
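A rough sketch of how these two terminal flavours are meant to be matched during scanning: `Terminal_Regexp` is applied to a single character of raw text (scanless mode), while `Terminal_Token` compares against a lexed token's type. This assumes the patched `lark/common.py`; the `tok` object below is a hypothetical stand-in with only a `.type` attribute.

```python
from types import SimpleNamespace
from lark.common import Terminal_Regexp, Terminal_Token   # as added by this patch

t_char = Terminal_Regexp(r'[0-9]')
print(t_char.match('7') is not None)       # True -- regexp matched against one character

t_tok = Terminal_Token('NUMBER')
tok = SimpleNamespace(type='NUMBER')       # hypothetical stand-in for a lexer Token
print(t_tok.match(tok))                    # True -- comparison by token type
```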
@@ -159,6 +159,8 @@ class Lark:
    def lex(self, text):
        if not hasattr(self, 'lexer'):
            self.lexer = self._build_lexer()
        stream = self.lexer.lex(text)
        if self.options.postlex:
            return self.options.postlex.process(stream)
@@ -67,8 +67,8 @@ TOKENS = {
    '_DOT': r'\.',
    'RULE': '!?[_?]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'"(\\"|\\\\|[^"])*?"',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/',
    'STRING': r'"(\\"|\\\\|[^"\n])*?"',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/',
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*',
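A standalone illustration (not part of the patch) of why `\n` is excluded from STRING and REGEXP: without it, an unterminated string literal in the grammar silently swallows the next line instead of failing where the error actually is.

```python
import re

old = re.compile(r'"(\\"|\\\\|[^"])*?"')
new = re.compile(r'"(\\"|\\\\|[^"\n])*?"')

text = '"oops\nrule: "x"'
print(old.match(text).group())   # '"oops\nrule: "' -- quietly spans the newline
print(new.match(text))           # None -- the error surfaces on the right line
```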
@@ -377,11 +377,15 @@ class Grammar:
            else:
                options = RuleOptions.new_from(options, create_token=name)

            name = tokens_to_convert[name]
            inner = Token('RULE', name + '_inner')
            new_rule_defs.append((name, T('expansions', [T('expansion', [inner])]), None))
            name = inner

            for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
                for i, sym in enumerate(exp.children):
                    if sym in tokens_to_convert:
                        exp.children[i] = Token(sym.type, tokens_to_convert[sym])
        else:
            for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
                for i, sym in enumerate(exp.children):
                    if sym in tokens_to_convert:
                        exp.children[i] = Token(sym.type, tokens_to_convert[sym])

        new_rule_defs.append((name, tree, options))
@@ -3,8 +3,8 @@ import sre_parse

from .lexer import Lexer, ContextualLexer, Token

from .common import is_terminal, GrammarError, ParserConf
from .parsers import lalr_parser, earley, nearley
from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
from .parsers import lalr_parser, old_earley, nearley, earley
from .tree import Transformer

class WithLexer:
@@ -70,13 +70,13 @@ class Nearley(WithLexer):
        return res[0]

class Earley(WithLexer):
class OldEarley(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)

        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]

        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

    def _prepare_expansion(self, expansion):
        return [(sym,) if is_terminal(sym) else sym for sym in expansion]
@@ -100,13 +100,13 @@ def tokenize_text(text):
    return new_text

class Earley_NoLex:
class OldEarley_NoLex:
    def __init__(self, lexer_conf, parser_conf):
        self.token_by_name = {t.name:t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]

        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

    def _prepare_expansion(self, expansion):
        for sym in expansion:
@@ -125,6 +125,43 @@ class Earley_NoLex:
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]
class Earley_NoLex:
    def __init__(self, lexer_conf, parser_conf):
        self.token_by_name = {t.name:t for t in lexer_conf.tokens}

        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]

        self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)

    def _prepare_expansion(self, expansion):
        for sym in expansion:
            if is_terminal(sym):
                regexp = self.token_by_name[sym].pattern.to_regexp()
                width = sre_parse.parse(regexp).getwidth()
                if width != (1,1):
                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
                yield Terminal_Regexp(regexp)
            else:
                yield sym

    def parse(self, text):
        new_text = tokenize_text(text)
        return self.parser.parse(new_text)

class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)

        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]

        self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)

    def _prepare_expansion(self, expansion):
        return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]

    def parse(self, text):
        tokens = list(self.lex(text))
        return self.parser.parse(tokens)
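As an aside, a standalone sketch of the width test used in `_prepare_expansion` above: in scanless mode the Earley parser scans the raw text one character at a time, so every terminal's regexp must match exactly one character, which is what `sre_parse.parse(...).getwidth()` checks.

```python
import sre_parse

for regexp in ['[0-9]', r'\d', 'a+']:
    width = sre_parse.parse(regexp).getwidth()
    print(regexp, width, 'ok' if width == (1, 1) else 'would raise GrammarError')
```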
def get_frontend(parser, lexer):
    if parser=='lalr':
@@ -1,25 +1,42 @@
"This module implements an Earley Parser"

# The parser uses a parse-forest to keep track of derivations and ambiguities.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer and Visitor, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

from ..common import ParseError, UnexpectedToken, is_terminal
from functools import cmp_to_key

from ..utils import compare
from ..common import ParseError, UnexpectedToken, Terminal
from .grammar_analysis import GrammarAnalyzer
from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse

class EndToken:
    type = '$end'

class Derivation(Tree):
    def __init__(self, rule, items=None):
        Tree.__init__(self, 'drv', items or [])
        self.rule = rule

END_TOKEN = EndToken()

class Item(object):
    def __init__(self, rule, ptr, start, data):
    def __init__(self, rule, ptr, start, tree):
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.data = data
        self.tree = tree if tree is not None else Derivation(self.rule)

    @property
    def expect(self):
@@ -29,8 +46,10 @@ class Item(object):
    def is_complete(self):
        return self.ptr == len(self.rule.expansion)

    def advance(self, data):
        return Item(self.rule, self.ptr+1, self.start, self.data + [data])
    def advance(self, tree):
        assert self.tree.data == 'drv'
        new_tree = Derivation(self.rule, self.tree.children + [tree])
        return Item(self.rule, self.ptr+1, self.start, new_tree)

    def __eq__(self, other):
        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
@@ -38,8 +57,8 @@ class Item(object):
        return hash((self.rule, self.ptr, id(self.start)))

    def __repr__(self):
        before = map(str, self.rule.expansion[:self.ptr])
        after = map(str, self.rule.expansion[self.ptr:])
        before = list(map(str, self.rule.expansion[:self.ptr]))
        after = list(map(str, self.rule.expansion[self.ptr:]))
        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
@@ -56,15 +75,18 @@ class NewsList(list):
        return self[i:]

class Column:
    "An entry in the table, aka Earley Chart"
    def __init__(self):
    def __init__(self, i):
        self.i = i
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
        self.to_scan = NewsList()
        self.item_count = 0

        self.added = set()
        self.completed = {}

    def add(self, items):
        """Sort items into scan/predict/reduce newslists
@@ -76,29 +98,24 @@ class Column:
        for item in items:
            if item.is_complete:
                # (We must allow repetition of empty rules)
                if item.rule.expansion:
                    # This is an important test to avoid infinite-loops,
                    # For example for the rule:
                    #   a: a | "b"
                    # If we can detect these cases statically, we can remove
                    # this test and gain a tiny performance boost
                    #
                    if item in added:
                        continue
                    added.add(item)
                self.to_reduce.append(item)
            else:
                if is_terminal(item.expect):
                    self.to_scan.append(item)

                # XXX TODO Potential bug: What happens if there's ambiguity in an empty rule?
                if item.rule.expansion and item in self.completed:
                    old_tree = self.completed[item].tree
                    if old_tree.data != 'ambig':
                        new_tree = old_tree.copy()
                        new_tree.rule = old_tree.rule
                        old_tree.set('ambig', [new_tree])
                    old_tree.children.append(item.tree)
                else:
                    if item in added:
                        continue
                    self.completed[item] = item
                    self.to_reduce.append(item)
            else:
                if item not in added:
                    added.add(item)
                    self.to_predict.append(item)
                if isinstance(item.expect, Terminal):
                    self.to_scan.append(item)
                else:
                    self.to_predict.append(item)

            self.item_count += 1  # Only count if actually added
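To make the packing step above concrete, here is a hand-rolled sketch (using the `Tree.copy`/`Tree.set` helpers added to `lark/tree.py` later in this diff) of what happens when a second derivation completes for an item that already has one: the shared node is rewritten in place into an 'ambig' node that collects both alternatives, instead of duplicating the item in the column.

```python
from lark.tree import Tree

old_tree = Tree('drv', ['a', 'b'])    # first derivation, recorded in column.completed
new_tree = Tree('drv', ['ab'])        # a second derivation completes for the same item

if old_tree.data != 'ambig':
    alt = old_tree.copy()             # the real code also copies .rule onto the alternative
    old_tree.set('ambig', [alt])      # mutate the shared node into an ambiguity node
old_tree.children.append(new_tree)

print(old_tree.data, len(old_tree.children))   # ambig 2
```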
@@ -106,17 +123,16 @@ class Column:
        return bool(self.item_count)

class Parser:
    def __init__(self, parser_conf):
        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.start = parser_conf.start
    def __init__(self, rules, start, callback):
        self.analysis = GrammarAnalyzer(rules, start)
        self.start = start

        self.postprocess = {}
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':  # XXX kinda ugly
                a = rule.alias
                self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

    def parse(self, stream, start=None):
@@ -124,16 +140,15 @@ class Parser:
        start = start or self.start

        def predict(nonterm, i):
            assert not is_terminal(nonterm), nonterm
            return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]
            assert not isinstance(nonterm, Terminal), nonterm
            return [Item(rule, 0, i, None) for rule in self.predictions[nonterm]]

        def complete(item):
            name = item.rule.origin
            item.data = self.postprocess[item.rule](item.data)
            return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]

        def process_column(i, token, cur_set):
            next_set = Column()
            next_set = Column(i)

            while True:
                to_predict = {x.expect for x in cur_set.to_predict.get_news()
@@ -147,21 +162,20 @@ class Parser:
                for item in to_reduce:
                    cur_set.add( complete(item) )

            if token is not END_TOKEN:
                for item in cur_set.to_scan.get_news():
                    match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
                    if match:
                to_scan = cur_set.to_scan.get_news()
                for item in to_scan:
                    if item.expect.match(token):
                        next_set.add([item.advance(stream[i])])

            if not next_set and token is not END_TOKEN:
                expect = {i.expect[-1] for i in cur_set.to_scan}
                expect = {i.expect for i in cur_set.to_scan}
                raise UnexpectedToken(token, expect, stream, i)

            return cur_set, next_set

        # Main loop starts
        column0 = Column()
        column0 = Column(0)
        column0.add(predict(start, column0))

        cur_set = column0
@@ -171,10 +185,83 @@ class Parser:
        last_set, _ = process_column(len(stream), END_TOKEN, cur_set)

        # Parse ended. Now build a parse tree
        solutions = [n.data for n in last_set.to_reduce
        solutions = [n.tree for n in last_set.to_reduce
                     if n.rule.origin==start and n.start is column0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')
            return solutions
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            tree = Tree('ambig', solutions)

        ResolveAmbig().visit(tree)
        return ApplyCallbacks(self.postprocess).transform(tree)

class ApplyCallbacks(Transformer_NoRecurse):
    def __init__(self, postprocess):
        self.postprocess = postprocess

    def drv(self, tree):
        children = tree.children
        callback = self.postprocess[tree.rule]
        if callback:
            return callback(children)
        else:
            return Tree(tree.rule.origin, children)
def _compare_rules(rule1, rule2):
    assert rule1.origin == rule2.origin
    c = compare( len(rule1.expansion), len(rule2.expansion))
    if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
        return c
    else:
        return -c

def _compare_drv(tree1, tree2):
    if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
        return compare(tree1, tree2)

    c = _compare_rules(tree1.rule, tree2.rule)
    if c:
        return c

    # rules are "equal", so compare trees
    for t1, t2 in zip(tree1.children, tree2.children):
        c = _compare_drv(t1, t2)
        if c:
            return c

    return compare(len(tree1.children), len(tree2.children))

class ResolveAmbig(Visitor_NoRecurse):
    def ambig(self, tree):
        best = max(tree.children, key=cmp_to_key(_compare_drv))
        assert best.data == 'drv'
        tree.set('drv', best.children)
        tree.rule = best.rule   # needed for applying callbacks
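The `cmp_to_key` call above adapts the old cmp-style comparator (`_compare_drv` returns -1/0/1) into a sort key, so `max` can pick the preferred derivation. A standalone illustration with a hypothetical comparator standing in for `_compare_drv`:

```python
from functools import cmp_to_key

def prefer_longer(a, b):              # hypothetical stand-in for _compare_drv
    return (len(a) > len(b)) - (len(a) < len(b))

print(max(['a', 'aaa', 'aa'], key=cmp_to_key(prefer_longer)))   # 'aaa'
```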
# RULES = [
#     ('a', ['d']),
#     ('d', ['b']),
#     ('b', ['C']),
#     ('b', ['b', 'C']),
#     ('b', ['C', 'b']),
# ]
# p = Parser(RULES, 'a')
# for x in p.parse('CC'):
#     print x.pretty()

#---------------

# RULES = [
#     ('s', ['a', 'a']),
#     ('a', ['b', 'b']),
#     ('b', ['C'], lambda (x,): x),
#     ('b', ['b', 'C']),
# ]
# p = Parser(RULES, 's', {})
# print p.parse('CCCCC').pretty()
@@ -13,7 +13,7 @@ class Rule(object):
        self.alias = alias

    def __repr__(self):
        return '<%s : %s>' % (self.origin, ' '.join(map(unicode,self.expansion)))
        return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))

class RulePtr(object):
    def __init__(self, rule, index):
@@ -0,0 +1,180 @@
"This module implements an Earley Parser"

# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

from ..common import ParseError, UnexpectedToken, is_terminal
from .grammar_analysis import GrammarAnalyzer

class EndToken:
    type = '$end'

END_TOKEN = EndToken()

class Item(object):
    def __init__(self, rule, ptr, start, data):
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.data = data

    @property
    def expect(self):
        return self.rule.expansion[self.ptr]

    @property
    def is_complete(self):
        return self.ptr == len(self.rule.expansion)

    def advance(self, data):
        return Item(self.rule, self.ptr+1, self.start, self.data + [data])

    def __eq__(self, other):
        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
    def __hash__(self):
        return hash((self.rule, self.ptr, id(self.start)))

    def __repr__(self):
        before = map(str, self.rule.expansion[:self.ptr])
        after = map(str, self.rule.expansion[self.ptr:])
        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))

class NewsList(list):
    "Keeps track of newly added items (append-only)"

    def __init__(self, initial=None):
        list.__init__(self, initial or [])
        self.last_iter = 0

    def get_news(self):
        i = self.last_iter
        self.last_iter = len(self)
        return self[i:]

class Column:
    "An entry in the table, aka Earley Chart"
    def __init__(self):
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
        self.to_scan = NewsList()
        self.item_count = 0

        self.added = set()

    def add(self, items):
        """Sort items into scan/predict/reduce newslists

        Makes sure only unique items are added.
        """
        added = self.added
        for item in items:
            if item.is_complete:
                # (We must allow repetition of empty rules)
                # if item.rule.expansion:
                    # This is an important test to avoid infinite-loops,
                    # For example for the rule:
                    #   a: a | "b"
                    # If we can detect these cases statically, we can remove
                    # this test and gain a tiny performance boost
                    #
                    # if item in added:
                    #     continue
                    # added.add(item)
                self.to_reduce.append(item)
            else:
                if is_terminal(item.expect):
                    self.to_scan.append(item)
                else:
                    if item in added:
                        continue
                    added.add(item)
                    self.to_predict.append(item)

            self.item_count += 1  # Only count if actually added

    def __nonzero__(self):
        return bool(self.item_count)

class Parser:
    def __init__(self, parser_conf):
        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.start = parser_conf.start

        self.postprocess = {}
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':  # XXX kinda ugly
                a = rule.alias
                self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

    def parse(self, stream, start=None):
        # Define parser functions
        start = start or self.start

        def predict(nonterm, i):
            assert not is_terminal(nonterm), nonterm
            return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]

        def complete(item):
            name = item.rule.origin
            item.data = self.postprocess[item.rule](item.data)
            return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]

        def process_column(i, token, cur_set):
            next_set = Column()

            while True:
                to_predict = {x.expect for x in cur_set.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = cur_set.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    cur_set.add( predict(nonterm, cur_set) )
                for item in to_reduce:
                    cur_set.add( complete(item) )

            if token is not END_TOKEN:
                for item in cur_set.to_scan.get_news():
                    match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
                    if match:
                        next_set.add([item.advance(stream[i])])

            if not next_set and token is not END_TOKEN:
                expect = {i.expect[-1] for i in cur_set.to_scan}
                raise UnexpectedToken(token, expect, stream, i)

            return cur_set, next_set

        # Main loop starts
        column0 = Column()
        column0.add(predict(start, column0))

        cur_set = column0
        for i, char in enumerate(stream):
            _, cur_set = process_column(i, char, cur_set)

        last_set, _ = process_column(len(stream), END_TOKEN, cur_set)

        # Parse ended. Now build a parse tree
        solutions = [n.data for n in last_set.to_reduce
                     if n.rule.origin==start and n.start is column0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')

        return solutions
@@ -2,7 +2,7 @@ import re
from collections import defaultdict

from .tree import Tree
from .common import is_terminal, ParserConf, PatternStr
from .common import is_terminal, ParserConf, PatternStr, Terminal
from .lexer import Token
from .parsers import earley
@@ -26,21 +26,14 @@ class Reconstructor:
        token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in _tokens}

        class MatchData(object):
            def __init__(self, data):
                self.data = data

            def __repr__(self):
                return '%s(%r)' % (type(self).__name__, self.data)

        class MatchTerminal(MatchData):
            def __call__(self, other):
        class MatchTerminal(Terminal):
            def match(self, other):
                if isinstance(other, Tree):
                    return False
                return token_res[self.data].match(other) is not None

        class MatchTree(MatchData):
            def __call__(self, other):
        class MatchTree(Terminal):
            def match(self, other):
                try:
                    return self.data == other.data
                except AttributeError:
@@ -90,7 +83,7 @@ class Reconstructor:
        for name, expansions in d.items():
            for expansion in expansions:
                reduced = [sym if sym.startswith('_') or sym in expand1s else
                           (MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym),)
                           MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym)
                           for sym in expansion if not is_discarded_terminal(sym)]

                rules.append((name, reduced, WriteTokens(name, expansion).f))
@@ -98,9 +91,9 @@ class Reconstructor:

    def _reconstruct(self, tree):
        parser = earley.Parser(ParserConf(self.rules, {}, tree.data))

        res ,= parser.parse(tree.children)  # XXX ambiguity?
        # TODO: ambiguity?
        parser = earley.Parser(self.rules, tree.data, {})
        res = parser.parse(tree.children)
        for item in res:
            if isinstance(item, Tree):
                for x in self._reconstruct(item):
@@ -32,7 +32,11 @@ class Tree(object):
            self.children[i:i+1] = kid.children

    def __eq__(self, other):
        return self.data == other.data and self.children == other.children
        try:
            return self.data == other.data and self.children == other.children
        except AttributeError:
            return False

    def __hash__(self):
        return hash((self.data, tuple(self.children)))
@@ -57,10 +61,24 @@ class Tree(object):
            if pred(c):
                yield c

    def iter_subtrees(self):
        q = [self]

        while q:
            subtree = q.pop()
            yield subtree
            q += [c for c in subtree.children if isinstance(c, Tree)]

    def __deepcopy__(self, memo):
        return type(self)(self.data, deepcopy(self.children, memo))

    def copy(self):
        return type(self)(self.data, self.children)
    def set(self, data, children):
        self.data = data
        self.children = children

class Transformer(object):
@@ -81,7 +99,7 @@ class Transformer(object):

class InlineTransformer(Transformer):
    def _get_func(self, name):
    def _get_func(self, name):  # use super()._get_func
        return inline_args(getattr(self, name)).__get__(self)
@@ -97,3 +115,35 @@ class Visitor(object):

    def __default__(self, tree):
        pass
class Visitor_NoRecurse(Visitor):
    def visit(self, tree):
        subtrees = list(tree.iter_subtrees())

        for subtree in reversed(subtrees):
            getattr(self, subtree.data, self.__default__)(subtree)
        return tree

class Transformer_NoRecurse(Transformer):
    def transform(self, tree):
        subtrees = list(tree.iter_subtrees())

        def _t(t):
            # Assumes t is already transformed
            try:
                f = self._get_func(t.data)
            except AttributeError:
                return self.__default__(t)
            else:
                return f(t)

        for subtree in reversed(subtrees):
            subtree.children = [_t(c) if isinstance(c, Tree) else c for c in subtree.children]

        return _t(tree)

    def __default__(self, t):
        return t
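A small usage sketch, assuming this patch's `lark.tree`: `iter_subtrees()` walks the tree iteratively, so both classes keep working on trees deeper than Python's recursion limit. The `Eval` transformer below is hypothetical; its methods receive the subtree whose children have already been transformed.

```python
from lark.tree import Tree, Transformer_NoRecurse

class Eval(Transformer_NoRecurse):
    def add(self, t):                 # called bottom-up with the subtree
        return sum(t.children)

tree = Tree('add', [1, Tree('add', [2, 3])])
print(Eval().transform(tree))         # 6
```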
@@ -69,3 +69,14 @@ def inline_args(f):
            return f.__func__(self, *args)
        return _f

try:
    compare = cmp
except NameError:
    def compare(a, b):
        if a == b:
            return 0
        elif a > b:
            return 1
        else:
            return -1
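On Python 2 this aliases the builtin `cmp`; on Python 3, where `cmp` is gone and the `try` raises `NameError`, it falls back to the hand-written three-way comparison, so callers like `_compare_rules` stay version-agnostic. A quick check, assuming the shim above (it lives in `lark/utils.py`) is in scope:

```python
print(compare(1, 2), compare(2, 2), compare(3, 2))   # -1 0 1
```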
@@ -73,6 +73,28 @@ class TestEarley(unittest.TestCase):
        l = Lark(grammar, parser='earley', lexer=None)
        l.parse(program)

    def test_earley_scanless3(self):
        "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"

        grammar = """
        start: A A
        A: "a"+
        """

        l = Lark(grammar, parser='earley', lexer=None)
        res = l.parse("aaa")
        self.assertEqual(res.children, ['aa', 'a'])

    def test_earley_scanless4(self):
        grammar = """
        start: A A?
        A: "a"+
        """

        l = Lark(grammar, parser='earley', lexer=None)
        res = l.parse("aaa")
        self.assertEqual(res.children, ['aaa'])

def _make_parser_test(LEXER, PARSER):
    def _Lark(grammar, **kwargs):
        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)