From a73cc9ad902a6fe8402a57ad777bdc2ee95aaecb Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 6 Mar 2017 12:32:12 +0200 Subject: [PATCH] Re-wrote the Earley parser to use a parse-forest It now knows how to resolve ambiguity! And in a memory-efficient way! --- README.md | 2 +- examples/conf_nolex.py | 2 +- lark/common.py | 25 ++++- lark/lark.py | 2 + lark/load_grammar.py | 16 ++- lark/parser_frontends.py | 49 ++++++++- lark/parsers/earley.py | 183 +++++++++++++++++++++++-------- lark/parsers/grammar_analysis.py | 2 +- lark/parsers/old_earley.py | 180 ++++++++++++++++++++++++++++++ lark/reconstruct.py | 25 ++--- lark/tree.py | 54 ++++++++- lark/utils.py | 11 ++ tests/test_parser.py | 22 ++++ 13 files changed, 491 insertions(+), 82 deletions(-) create mode 100644 lark/parsers/old_earley.py diff --git a/README.md b/README.md index 29f5684..decc815 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ These features may be implemented some day: - You can work with parse-trees instead of state-machines - The grammar is simple to read and write - There are no restrictions on grammar structure. Any grammar you write can be parsed. - - Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working. + - Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working - A well-written grammar is very fast - Note: Nondeterminstic grammars will run a little slower - Note: Ambiguous grammars (grammars that can be parsed in more than one way) are supported, but may cause significant slowdown if the ambiguity is too big) diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py index b30087b..6c16baf 100644 --- a/examples/conf_nolex.py +++ b/examples/conf_nolex.py @@ -25,7 +25,7 @@ parser = Lark(r""" %import common.WS_INLINE %ignore WS_INLINE - """) + """, lexer=None) def test(): sample_conf = """ diff --git a/lark/common.py b/lark/common.py index f0e1fb6..9a19fd3 100644 --- a/lark/common.py +++ b/lark/common.py @@ -29,7 +29,7 @@ class UnexpectedToken(ParseError): def is_terminal(sym): - return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$' + return isinstance(sym, Terminal) or sym.isupper() or sym[0] == '$' class LexerConf: @@ -81,3 +81,26 @@ class TokenDef(object): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) + +class Terminal: + def __init__(self, data): + self.data = data + + def __repr__(self): + return '%r' % self.data + + def __eq__(self, other): + return isinstance(other, type(self)) and self.data == other.data + def __hash__(self): + return hash(self.data) + + +class Terminal_Regexp(Terminal): + def __init__(self, data): + Terminal.__init__(self, data) + self.match = re.compile(data).match + +class Terminal_Token(Terminal): + def match(self, other): + return self.data == other.type + diff --git a/lark/lark.py b/lark/lark.py index 7bc546b..7d434bf 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -159,6 +159,8 @@ class Lark: def lex(self, text): + if not hasattr(self, 'lexer'): + self.lexer = self._build_lexer() stream = self.lexer.lex(text) if self.options.postlex: return self.options.postlex.process(stream) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 3ee510e..ac947ec 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -67,8 +67,8 @@ TOKENS = { '_DOT': r'\.', 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TOKEN': '_?[A-Z][_A-Z0-9]*', - 'STRING': r'"(\\"|\\\\|[^"])*?"', - 'REGEXP': 
r'/(?!/)(\\/|\\\\|[^/])*?/', + 'STRING': r'"(\\"|\\\\|[^"\n])*?"', + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/', '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', 'COMMENT': r'//[^\n]*', @@ -377,11 +377,15 @@ class Grammar: else: options = RuleOptions.new_from(options, create_token=name) name = tokens_to_convert[name] + inner = Token('RULE', name + '_inner') + new_rule_defs.append((name, T('expansions', [T('expansion', [inner])]), None)) + name = inner - for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ): - for i, sym in enumerate(exp.children): - if sym in tokens_to_convert: - exp.children[i] = Token(sym.type, tokens_to_convert[sym]) + else: + for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ): + for i, sym in enumerate(exp.children): + if sym in tokens_to_convert: + exp.children[i] = Token(sym.type, tokens_to_convert[sym]) new_rule_defs.append((name, tree, options)) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 1dfe14b..7043dbc 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -3,8 +3,8 @@ import sre_parse from .lexer import Lexer, ContextualLexer, Token -from .common import is_terminal, GrammarError, ParserConf -from .parsers import lalr_parser, earley, nearley +from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token +from .parsers import lalr_parser, old_earley, nearley, earley from .tree import Transformer class WithLexer: @@ -70,13 +70,13 @@ class Nearley(WithLexer): return res[0] -class Earley(WithLexer): +class OldEarley(WithLexer): def __init__(self, lexer_conf, parser_conf): WithLexer.__init__(self, lexer_conf) rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules] - self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) + self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) def _prepare_expansion(self, expansion): return [(sym,) if is_terminal(sym) else sym for sym in expansion] @@ -100,13 +100,13 @@ def tokenize_text(text): return new_text -class Earley_NoLex: +class OldEarley_NoLex: def __init__(self, lexer_conf, parser_conf): self.token_by_name = {t.name:t for t in lexer_conf.tokens} rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] - self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) + self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) def _prepare_expansion(self, expansion): for sym in expansion: @@ -125,6 +125,43 @@ class Earley_NoLex: assert len(res) ==1 , 'Ambiguious Parse! 
Not handled yet' return res[0] +class Earley_NoLex: + def __init__(self, lexer_conf, parser_conf): + self.token_by_name = {t.name:t for t in lexer_conf.tokens} + + rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] + + self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback) + + def _prepare_expansion(self, expansion): + for sym in expansion: + if is_terminal(sym): + regexp = self.token_by_name[sym].pattern.to_regexp() + width = sre_parse.parse(regexp).getwidth() + if width != (1,1): + raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) + yield Terminal_Regexp(regexp) + else: + yield sym + + def parse(self, text): + new_text = tokenize_text(text) + return self.parser.parse(new_text) + +class Earley(WithLexer): + def __init__(self, lexer_conf, parser_conf): + WithLexer.__init__(self, lexer_conf) + + rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules] + + self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback) + + def _prepare_expansion(self, expansion): + return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] + + def parse(self, text): + tokens = list(self.lex(text)) + return self.parser.parse(tokens) def get_frontend(parser, lexer): if parser=='lalr': diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 40c5432..d1ec543 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -1,25 +1,42 @@ "This module implements an Earley Parser" +# The parser uses a parse-forest to keep track of derivations and ambiguations. +# When the parse ends successfully, a disambiguation stage resolves all ambiguity +# (right now ambiguity resolution is not developed beyond the needs of lark) +# Afterwards the parse tree is reduced (transformed) according to user callbacks. +# I use the no-recursion version of Transformer and Visitor, because the tree might be +# deeper than Python's recursion limit (a bit absurd, but that's life) +# # The algorithm keeps track of each state set, using a corresponding Column instance. # Column keeps track of new items using NewsList instances. 
# # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken, is_terminal +from functools import cmp_to_key + +from ..utils import compare +from ..common import ParseError, UnexpectedToken, Terminal from .grammar_analysis import GrammarAnalyzer +from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse + class EndToken: type = '$end' +class Derivation(Tree): + def __init__(self, rule, items=None): + Tree.__init__(self, 'drv', items or []) + self.rule = rule + END_TOKEN = EndToken() class Item(object): - def __init__(self, rule, ptr, start, data): + def __init__(self, rule, ptr, start, tree): self.rule = rule self.ptr = ptr self.start = start - self.data = data + self.tree = tree if tree is not None else Derivation(self.rule) @property def expect(self): @@ -29,8 +46,10 @@ class Item(object): def is_complete(self): return self.ptr == len(self.rule.expansion) - def advance(self, data): - return Item(self.rule, self.ptr+1, self.start, self.data + [data]) + def advance(self, tree): + assert self.tree.data == 'drv' + new_tree = Derivation(self.rule, self.tree.children + [tree]) + return Item(self.rule, self.ptr+1, self.start, new_tree) def __eq__(self, other): return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule @@ -38,8 +57,8 @@ class Item(object): return hash((self.rule, self.ptr, id(self.start))) def __repr__(self): - before = map(str, self.rule.expansion[:self.ptr]) - after = map(str, self.rule.expansion[self.ptr:]) + before = list(map(str, self.rule.expansion[:self.ptr])) + after = list(map(str, self.rule.expansion[self.ptr:])) return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) @@ -56,15 +75,18 @@ class NewsList(list): return self[i:] + class Column: "An entry in the table, aka Earley Chart" - def __init__(self): + def __init__(self, i): + self.i = i self.to_reduce = NewsList() self.to_predict = NewsList() self.to_scan = NewsList() self.item_count = 0 self.added = set() + self.completed = {} def add(self, items): """Sort items into scan/predict/reduce newslists @@ -76,29 +98,24 @@ class Column: for item in items: if item.is_complete: - - # (We must allow repetition of empty rules) - if item.rule.expansion: - - # This is an important test to avoid infinite-loops, - # For example for the rule: - # a: a | "b" - # If we can detect these cases statically, we can remove - # this test an gain a tiny performance boost - # - if item in added: - continue - added.add(item) - - self.to_reduce.append(item) - else: - if is_terminal(item.expect): - self.to_scan.append(item) + # XXX TODO Potential bug: What happens if there's ambiguity in an empty rule? 
+ if item.rule.expansion and item in self.completed: + old_tree = self.completed[item].tree + if old_tree.data != 'ambig': + new_tree = old_tree.copy() + new_tree.rule = old_tree.rule + old_tree.set('ambig', [new_tree]) + old_tree.children.append(item.tree) else: - if item in added: - continue + self.completed[item] = item + self.to_reduce.append(item) + else: + if item not in added: added.add(item) - self.to_predict.append(item) + if isinstance(item.expect, Terminal): + self.to_scan.append(item) + else: + self.to_predict.append(item) self.item_count += 1 # Only count if actually added @@ -106,17 +123,16 @@ class Column: return bool(self.item_count) class Parser: - def __init__(self, parser_conf): - - self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start) - self.start = parser_conf.start + def __init__(self, rules, start, callback): + self.analysis = GrammarAnalyzer(rules, start) + self.start = start self.postprocess = {} self.predictions = {} for rule in self.analysis.rules: if rule.origin != '$root': # XXX kinda ugly a = rule.alias - self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a) + self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] def parse(self, stream, start=None): @@ -124,16 +140,15 @@ class Parser: start = start or self.start def predict(nonterm, i): - assert not is_terminal(nonterm), nonterm - return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]] + assert not isinstance(nonterm, Terminal), nonterm + return [Item(rule, 0, i, None) for rule in self.predictions[nonterm]] def complete(item): name = item.rule.origin - item.data = self.postprocess[item.rule](item.data) - return [i.advance(item.data) for i in item.start.to_predict if i.expect == name] + return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] def process_column(i, token, cur_set): - next_set = Column() + next_set = Column(i) while True: to_predict = {x.expect for x in cur_set.to_predict.get_news() @@ -147,21 +162,20 @@ class Parser: for item in to_reduce: cur_set.add( complete(item) ) - if token is not END_TOKEN: - for item in cur_set.to_scan.get_news(): - match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type - if match: + to_scan = cur_set.to_scan.get_news() + for item in to_scan: + if item.expect.match(token): next_set.add([item.advance(stream[i])]) if not next_set and token is not END_TOKEN: - expect = {i.expect[-1] for i in cur_set.to_scan} + expect = {i.expect for i in cur_set.to_scan} raise UnexpectedToken(token, expect, stream, i) return cur_set, next_set # Main loop starts - column0 = Column() + column0 = Column(0) column0.add(predict(start, column0)) cur_set = column0 @@ -171,10 +185,83 @@ class Parser: last_set, _ = process_column(len(stream), END_TOKEN, cur_set) # Parse ended. 
Now build a parse tree - solutions = [n.data for n in last_set.to_reduce + solutions = [n.tree for n in last_set.to_reduce if n.rule.origin==start and n.start is column0] if not solutions: raise ParseError('Incomplete parse: Could not find a solution to input') - - return solutions + elif len(solutions) == 1: + tree = solutions[0] + else: + tree = Tree('ambig', solutions) + + ResolveAmbig().visit(tree) + return ApplyCallbacks(self.postprocess).transform(tree) + + + +class ApplyCallbacks(Transformer_NoRecurse): + def __init__(self, postprocess): + self.postprocess = postprocess + + def drv(self, tree): + children = tree.children + callback = self.postprocess[tree.rule] + if callback: + return callback(children) + else: + return Tree(rule.origin, children) + +def _compare_rules(rule1, rule2): + assert rule1.origin == rule2.origin + c = compare( len(rule1.expansion), len(rule2.expansion)) + if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here + return c + else: + return -c + +def _compare_drv(tree1, tree2): + if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): + return compare(tree1, tree2) + + c = _compare_rules(tree1.rule, tree2.rule) + if c: + return c + + # rules are "equal", so compare trees + for t1, t2 in zip(tree1.children, tree2.children): + c = _compare_drv(t1, t2) + if c: + return c + + return compare(len(tree1.children), len(tree2.children)) + + +class ResolveAmbig(Visitor_NoRecurse): + def ambig(self, tree): + best = max(tree.children, key=cmp_to_key(_compare_drv)) + assert best.data == 'drv' + tree.set('drv', best.children) + tree.rule = best.rule # needed for applying callbacks + + +# RULES = [ +# ('a', ['d']), +# ('d', ['b']), +# ('b', ['C']), +# ('b', ['b', 'C']), +# ('b', ['C', 'b']), +# ] +# p = Parser(RULES, 'a') +# for x in p.parse('CC'): +# print x.pretty() + +#--------------- +# RULES = [ +# ('s', ['a', 'a']), +# ('a', ['b', 'b']), +# ('b', ['C'], lambda (x,): x), +# ('b', ['b', 'C']), +# ] +# p = Parser(RULES, 's', {}) +# print p.parse('CCCCC').pretty() diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 3428713..f08a8bd 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -13,7 +13,7 @@ class Rule(object): self.alias = alias def __repr__(self): - return '<%s : %s>' % (self.origin, ' '.join(map(unicode,self.expansion))) + return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) class RulePtr(object): def __init__(self, rule, index): diff --git a/lark/parsers/old_earley.py b/lark/parsers/old_earley.py new file mode 100644 index 0000000..24c1e4b --- /dev/null +++ b/lark/parsers/old_earley.py @@ -0,0 +1,180 @@ +"This module implements an Earley Parser" + +# The algorithm keeps track of each state set, using a corresponding Column instance. +# Column keeps track of new items using NewsList instances. 
+# +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + +from ..common import ParseError, UnexpectedToken, is_terminal +from .grammar_analysis import GrammarAnalyzer + +class EndToken: + type = '$end' + +END_TOKEN = EndToken() + +class Item(object): + def __init__(self, rule, ptr, start, data): + self.rule = rule + self.ptr = ptr + self.start = start + self.data = data + + @property + def expect(self): + return self.rule.expansion[self.ptr] + + @property + def is_complete(self): + return self.ptr == len(self.rule.expansion) + + def advance(self, data): + return Item(self.rule, self.ptr+1, self.start, self.data + [data]) + + def __eq__(self, other): + return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule + def __hash__(self): + return hash((self.rule, self.ptr, id(self.start))) + + def __repr__(self): + before = map(str, self.rule.expansion[:self.ptr]) + after = map(str, self.rule.expansion[self.ptr:]) + return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) + + +class NewsList(list): + "Keeps track of newly added items (append-only)" + + def __init__(self, initial=None): + list.__init__(self, initial or []) + self.last_iter = 0 + + def get_news(self): + i = self.last_iter + self.last_iter = len(self) + return self[i:] + + +class Column: + "An entry in the table, aka Earley Chart" + def __init__(self): + self.to_reduce = NewsList() + self.to_predict = NewsList() + self.to_scan = NewsList() + self.item_count = 0 + + self.added = set() + + def add(self, items): + """Sort items into scan/predict/reduce newslists + + Makes sure only unique items are added. + """ + + added = self.added + for item in items: + + if item.is_complete: + + # (We must allow repetition of empty rules) + # if item.rule.expansion: + + # This is an important test to avoid infinite-loops, + # For example for the rule: + # a: a | "b" + # If we can detect these cases statically, we can remove + # this test an gain a tiny performance boost + # + # if item in added: + # continue + # added.add(item) + + self.to_reduce.append(item) + else: + if is_terminal(item.expect): + self.to_scan.append(item) + else: + if item in added: + continue + added.add(item) + self.to_predict.append(item) + + self.item_count += 1 # Only count if actually added + + def __nonzero__(self): + return bool(self.item_count) + +class Parser: + def __init__(self, parser_conf): + + self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start) + self.start = parser_conf.start + + self.postprocess = {} + self.predictions = {} + for rule in self.analysis.rules: + if rule.origin != '$root': # XXX kinda ugly + a = rule.alias + self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a) + self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + + def parse(self, stream, start=None): + # Define parser functions + start = start or self.start + + def predict(nonterm, i): + assert not is_terminal(nonterm), nonterm + return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]] + + def complete(item): + name = item.rule.origin + item.data = self.postprocess[item.rule](item.data) + return [i.advance(item.data) for i in item.start.to_predict if i.expect == name] + + def process_column(i, token, cur_set): + next_set = Column() + + while True: + to_predict = {x.expect for x in cur_set.to_predict.get_news() + if x.ptr} # if not part of an already predicted batch + to_reduce = cur_set.to_reduce.get_news() + if not 
(to_predict or to_reduce): + break + + for nonterm in to_predict: + cur_set.add( predict(nonterm, cur_set) ) + for item in to_reduce: + cur_set.add( complete(item) ) + + + if token is not END_TOKEN: + for item in cur_set.to_scan.get_news(): + match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type + if match: + next_set.add([item.advance(stream[i])]) + + if not next_set and token is not END_TOKEN: + expect = {i.expect[-1] for i in cur_set.to_scan} + raise UnexpectedToken(token, expect, stream, i) + + return cur_set, next_set + + # Main loop starts + column0 = Column() + column0.add(predict(start, column0)) + + cur_set = column0 + for i, char in enumerate(stream): + _, cur_set = process_column(i, char, cur_set) + + last_set, _ = process_column(len(stream), END_TOKEN, cur_set) + + # Parse ended. Now build a parse tree + solutions = [n.data for n in last_set.to_reduce + if n.rule.origin==start and n.start is column0] + + if not solutions: + raise ParseError('Incomplete parse: Could not find a solution to input') + + return solutions diff --git a/lark/reconstruct.py b/lark/reconstruct.py index b166882..2c9bcd1 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -2,7 +2,7 @@ import re from collections import defaultdict from .tree import Tree -from .common import is_terminal, ParserConf, PatternStr +from .common import is_terminal, ParserConf, PatternStr, Terminal from .lexer import Token from .parsers import earley @@ -26,21 +26,14 @@ class Reconstructor: token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in _tokens} - class MatchData(object): - def __init__(self, data): - self.data = data - - def __repr__(self): - return '%s(%r)' % (type(self).__name__, self.data) - - class MatchTerminal(MatchData): - def __call__(self, other): + class MatchTerminal(Terminal): + def match(self, other): if isinstance(other, Tree): return False return token_res[self.data].match(other) is not None - class MatchTree(MatchData): - def __call__(self, other): + class MatchTree(Terminal): + def match(self, other): try: return self.data == other.data except AttributeError: @@ -90,7 +83,7 @@ class Reconstructor: for name, expansions in d.items(): for expansion in expansions: reduced = [sym if sym.startswith('_') or sym in expand1s else - (MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym),) + MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym) for sym in expansion if not is_discarded_terminal(sym)] rules.append((name, reduced, WriteTokens(name, expansion).f)) @@ -98,9 +91,9 @@ class Reconstructor: def _reconstruct(self, tree): - parser = earley.Parser(ParserConf(self.rules, {}, tree.data)) - - res ,= parser.parse(tree.children) # XXX ambiguity? + # TODO: ambiguity? 
+ parser = earley.Parser(self.rules, tree.data, {}) + res = parser.parse(tree.children) for item in res: if isinstance(item, Tree): for x in self._reconstruct(item): diff --git a/lark/tree.py b/lark/tree.py index 9e818ec..e8b75c3 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -32,7 +32,11 @@ class Tree(object): self.children[i:i+1] = kid.children def __eq__(self, other): - return self.data == other.data and self.children == other.children + try: + return self.data == other.data and self.children == other.children + except AttributeError: + return False + def __hash__(self): return hash((self.data, tuple(self.children))) @@ -57,10 +61,24 @@ class Tree(object): if pred(c): yield c + def iter_subtrees(self): + q = [self] + + while q: + subtree = q.pop() + yield subtree + q += [c for c in subtree.children if isinstance(c, Tree)] + def __deepcopy__(self, memo): return type(self)(self.data, deepcopy(self.children, memo)) + def copy(self): + return type(self)(self.data, self.children) + def set(self, data, children): + self.data = data + self.children = children + class Transformer(object): @@ -81,7 +99,7 @@ class Transformer(object): class InlineTransformer(Transformer): - def _get_func(self, name): + def _get_func(self, name): # use super()._get_func return inline_args(getattr(self, name)).__get__(self) @@ -97,3 +115,35 @@ class Visitor(object): def __default__(self, tree): pass + + +class Visitor_NoRecurse(Visitor): + def visit(self, tree): + subtrees = list(tree.iter_subtrees()) + + for subtree in reversed(subtrees): + getattr(self, subtree.data, self.__default__)(subtree) + return tree + + +class Transformer_NoRecurse(Transformer): + def transform(self, tree): + subtrees = list(tree.iter_subtrees()) + + def _t(t): + # Assumes t is already transformed + try: + f = self._get_func(t.data) + except AttributeError: + return self.__default__(t) + else: + return f(t) + + for subtree in reversed(subtrees): + subtree.children = [_t(c) if isinstance(c, Tree) else c for c in subtree.children] + + return _t(tree) + + def __default__(self, t): + return t + diff --git a/lark/utils.py b/lark/utils.py index aa961f2..6f1e8b4 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -69,3 +69,14 @@ def inline_args(f): return f.__func__(self, *args) return _f + +try: + compare = cmp +except NameError: + def compare(a, b): + if a == b: + return 0 + elif a > b: + return 1 + else: + return -1 diff --git a/tests/test_parser.py b/tests/test_parser.py index c10ec51..c4ef4bf 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -73,6 +73,28 @@ class TestEarley(unittest.TestCase): l = Lark(grammar, parser='earley', lexer=None) l.parse(program) + def test_earley_scanless3(self): + "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" + + grammar = """ + start: A A + A: "a"+ + """ + + l = Lark(grammar, parser='earley', lexer=None) + res = l.parse("aaa") + self.assertEqual(res.children, ['aa', 'a']) + + def test_earley_scanless4(self): + grammar = """ + start: A A? + A: "a"+ + """ + + l = Lark(grammar, parser='earley', lexer=None) + res = l.parse("aaa") + self.assertEqual(res.children, ['aaa']) + def _make_parser_test(LEXER, PARSER): def _Lark(grammar, **kwargs): return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)
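
A quick usage sketch (not part of the diff): the new tests above exercise the rewritten Earley parser through the public Lark API, so something like the following should reproduce the disambiguation behaviour they assert. The grammars and inputs are taken from test_earley_scanless3/4; the `from lark import Lark` import is assumed to match the rest of the test suite.

    # Sketch based on the new tests: scanless Earley parsing of an ambiguous
    # grammar. "aaa" can be split between the A pseudo-terminals in several
    # ways; the parse-forest plus the ResolveAmbig stage picks one derivation.
    from lark import Lark

    ambig = Lark("""
        start: A A
        A: "a"+
    """, parser='earley', lexer=None)
    print(ambig.parse("aaa").children)     # test_earley_scanless3 expects ['aa', 'a']

    optional = Lark("""
        start: A A?
        A: "a"+
    """, parser='earley', lexer=None)
    print(optional.parse("aaa").children)  # test_earley_scanless4 expects ['aaa']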