It now knows how to resolve ambiguity! And in a memory-efficient way!
@@ -134,7 +134,7 @@ These features may be implemented some day:
- You can work with parse-trees instead of state-machines
- The grammar is simple to read and write
- There are no restrictions on grammar structure. Any grammar you write can be parsed.
- Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working.
- Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working
- A well-written grammar is very fast
- Note: Nondeterministic grammars will run a little slower
- Note: Ambiguous grammars (grammars that can be parsed in more than one way) are supported, but may cause significant slowdown if the ambiguity is too big
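To illustrate the ambiguity note above, here is a minimal sketch using the 0.x-era API shown in this diff; the grammar and expected result are taken directly from the new test_earley_scanless3 test further down:

from lark import Lark

# "aaa" can split as "aa"+"a" or "a"+"aa"; the new disambiguation
# stage picks a single deterministic result.
parser = Lark("""
start: A A
A: "a"+
""", parser='earley', lexer=None)

assert parser.parse("aaa").children == ['aa', 'a']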
@@ -25,7 +25,7 @@ parser = Lark(r"""
%import common.WS_INLINE
%ignore WS_INLINE
""")
""", lexer=None)
def test():
sample_conf = """
@@ -29,7 +29,7 @@ class UnexpectedToken(ParseError):
def is_terminal(sym):
return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$'
return isinstance(sym, Terminal) or sym.isupper() or sym[0] == '$'
class LexerConf:
@@ -81,3 +81,26 @@ class TokenDef(object):
def __repr__(self):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
class Terminal:
def __init__(self, data):
self.data = data
def __repr__(self):
return '%r' % self.data
def __eq__(self, other):
return isinstance(other, type(self)) and self.data == other.data
def __hash__(self):
return hash(self.data)
class Terminal_Regexp(Terminal):
def __init__(self, data):
Terminal.__init__(self, data)
self.match = re.compile(data).match
class Terminal_Token(Terminal):
def match(self, other):
return self.data == other.type
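A quick, hypothetical check of the two terminal flavors defined above (FakeToken is a stand-in for the lexer's Token class, not part of this diff):

t_re = Terminal_Regexp('[0-9]+')
assert t_re.match('123')          # matches raw text via a compiled regex

class FakeToken:
    type = 'NUMBER'

t_tok = Terminal_Token('NUMBER')
assert t_tok.match(FakeToken())   # matches on the token's type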
@@ -159,6 +159,8 @@ class Lark:
def lex(self, text):
if not hasattr(self, 'lexer'):
self.lexer = self._build_lexer()
stream = self.lexer.lex(text)
if self.options.postlex:
return self.options.postlex.process(stream)
@@ -67,8 +67,8 @@ TOKENS = {
'_DOT': r'\.',
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TOKEN': '_?[A-Z][_A-Z0-9]*',
'STRING': r'"(\\"|\\\\|[^"])*?"',
'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/',
'STRING': r'"(\\"|\\\\|[^"\n])*?"',
'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/',
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'//[^\n]*',
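The change above adds \n to the negated character classes so STRING and REGEXP can no longer silently span lines. A small sketch of the difference:

import re

OLD_STRING = r'"(\\"|\\\\|[^"])*?"'
NEW_STRING = r'"(\\"|\\\\|[^"\n])*?"'

text = '"abc\ndef"'
assert re.match(OLD_STRING, text)       # old pattern matches across the newline
assert not re.match(NEW_STRING, text)   # new pattern refuses to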
@@ -377,11 +377,15 @@ class Grammar:
else:
options = RuleOptions.new_from(options, create_token=name)
name = tokens_to_convert[name]
inner = Token('RULE', name + '_inner')
new_rule_defs.append((name, T('expansions', [T('expansion', [inner])]), None))
name = inner
for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
for i, sym in enumerate(exp.children):
if sym in tokens_to_convert:
exp.children[i] = Token(sym.type, tokens_to_convert[sym])
else:
for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
for i, sym in enumerate(exp.children):
if sym in tokens_to_convert:
exp.children[i] = Token(sym.type, tokens_to_convert[sym])
new_rule_defs.append((name, tree, options))
@@ -3,8 +3,8 @@ import sre_parse
from .lexer import Lexer, ContextualLexer, Token
from .common import is_terminal, GrammarError, ParserConf
from .parsers import lalr_parser, earley, nearley
from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
from .parsers import lalr_parser, old_earley, nearley, earley
from .tree import Transformer
class WithLexer:
@@ -70,13 +70,13 @@ class Nearley(WithLexer):
return res[0]
class Earley(WithLexer):
class OldEarley(WithLexer):
def __init__(self, lexer_conf, parser_conf):
WithLexer.__init__(self, lexer_conf)
rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
def _prepare_expansion(self, expansion):
return [(sym,) if is_terminal(sym) else sym for sym in expansion]
@@ -100,13 +100,13 @@ def tokenize_text(text):
return new_text
class Earley_NoLex:
class OldEarley_NoLex:
def __init__(self, lexer_conf, parser_conf):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
def _prepare_expansion(self, expansion):
for sym in expansion:
@@ -125,6 +125,43 @@ class Earley_NoLex:
assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
return res[0]
class Earley_NoLex:
def __init__(self, lexer_conf, parser_conf):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)
def _prepare_expansion(self, expansion):
for sym in expansion:
if is_terminal(sym):
regexp = self.token_by_name[sym].pattern.to_regexp()
width = sre_parse.parse(regexp).getwidth()
if width != (1,1):
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
yield Terminal_Regexp(regexp)
else:
yield sym
def parse(self, text):
new_text = tokenize_text(text)
return self.parser.parse(new_text)
class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf):
WithLexer.__init__(self, lexer_conf)
rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)
def _prepare_expansion(self, expansion):
return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]
def parse(self, text):
tokens = list(self.lex(text))
return self.parser.parse(tokens)
def get_frontend(parser, lexer):
if parser=='lalr':
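Earley_NoLex's width check above relies on sre_parse.parse(...).getwidth(), which reports the (min, max) match width of a regexp; scanless parsing needs every terminal to be exactly one character wide. A quick illustration using the same internal module the diff itself imports:

import sre_parse

assert sre_parse.parse('a').getwidth() == (1, 1)     # fixed width 1: allowed
assert sre_parse.parse('a+').getwidth() != (1, 1)    # unbounded width: rejected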
@@ -1,25 +1,42 @@
"This module implements an Earley Parser"
# The parser uses a parse-forest to keep track of derivations and ambiguities.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer and Visitor, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from ..common import ParseError, UnexpectedToken, is_terminal
from functools import cmp_to_key
from ..utils import compare
from ..common import ParseError, UnexpectedToken, Terminal
from .grammar_analysis import GrammarAnalyzer
from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
class EndToken:
type = '$end'
class Derivation(Tree):
def __init__(self, rule, items=None):
Tree.__init__(self, 'drv', items or [])
self.rule = rule
END_TOKEN = EndToken()
class Item(object):
def __init__(self, rule, ptr, start, data):
def __init__(self, rule, ptr, start, tree):
self.rule = rule
self.ptr = ptr
self.start = start
self.data = data
self.tree = tree if tree is not None else Derivation(self.rule)
@property
def expect(self):
@@ -29,8 +46,10 @@ class Item(object):
def is_complete(self):
return self.ptr == len(self.rule.expansion)
def advance(self, data):
return Item(self.rule, self.ptr+1, self.start, self.data + [data])
def advance(self, tree):
assert self.tree.data == 'drv'
new_tree = Derivation(self.rule, self.tree.children + [tree])
return Item(self.rule, self.ptr+1, self.start, new_tree)
def __eq__(self, other):
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
@@ -38,8 +57,8 @@ class Item(object):
return hash((self.rule, self.ptr, id(self.start)))
def __repr__(self):
before = map(str, self.rule.expansion[:self.ptr])
after = map(str, self.rule.expansion[self.ptr:])
before = list(map(str, self.rule.expansion[:self.ptr]))
after = list(map(str, self.rule.expansion[self.ptr:]))
return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
@@ -56,15 +75,18 @@ class NewsList(list):
return self[i:]
class Column:
"An entry in the table, aka Earley Chart"
def __init__(self):
def __init__(self, i):
self.i = i
self.to_reduce = NewsList()
self.to_predict = NewsList()
self.to_scan = NewsList()
self.item_count = 0
self.added = set()
self.completed = {}
def add(self, items):
"""Sort items into scan/predict/reduce newslists
@@ -76,29 +98,24 @@ class Column:
for item in items:
if item.is_complete:
# (We must allow repetition of empty rules)
if item.rule.expansion:
# This is an important test to avoid infinite-loops,
# For example for the rule:
# a: a | "b"
# If we can detect these cases statically, we can remove
# this test and gain a tiny performance boost
#
if item in added:
continue
added.add(item)
self.to_reduce.append(item)
else:
if is_terminal(item.expect):
self.to_scan.append(item)
# XXX TODO Potential bug: What happens if there's ambiguity in an empty rule?
if item.rule.expansion and item in self.completed:
old_tree = self.completed[item].tree
if old_tree.data != 'ambig':
new_tree = old_tree.copy()
new_tree.rule = old_tree.rule
old_tree.set('ambig', [new_tree])
old_tree.children.append(item.tree)
else:
if item in added:
continue
self.completed[item] = item
self.to_reduce.append(item)
else:
if item not in added:
added.add(item)
self.to_predict.append(item)
if isinstance(item.expect, Terminal):
self.to_scan.append(item)
else:
self.to_predict.append(item)
self.item_count += 1 # Only count if actually added
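The completed-items logic above is the heart of this commit: when a second derivation completes for the same item, the existing tree is demoted under an 'ambig' node and the new derivation is appended to it, so alternatives share one packed node instead of multiplying. A standalone sketch of that packing step (Node is a hypothetical stand-in for lark's Tree, not part of this diff):

class Node:
    def __init__(self, data, children):
        self.data, self.children = data, children
    def copy(self):
        return Node(self.data, self.children)
    def set(self, data, children):
        self.data, self.children = data, children

completed = {}

def add_completed(key, tree):
    if key in completed:
        old = completed[key]
        if old.data != 'ambig':
            old.set('ambig', [old.copy()])   # first duplicate: wrap the original
        old.children.append(tree)            # later duplicates: just append
    else:
        completed[key] = tree

add_completed('x', Node('drv', ['first']))
add_completed('x', Node('drv', ['second']))
assert completed['x'].data == 'ambig' and len(completed['x'].children) == 2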
@@ -106,17 +123,16 @@ class Column:
return bool(self.item_count)
class Parser:
def __init__(self, parser_conf):
self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
self.start = parser_conf.start
def __init__(self, rules, start, callback):
self.analysis = GrammarAnalyzer(rules, start)
self.start = start
self.postprocess = {}
self.predictions = {}
for rule in self.analysis.rules:
if rule.origin != '$root': # XXX kinda ugly
a = rule.alias
self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
def parse(self, stream, start=None):
@@ -124,16 +140,15 @@ class Parser:
start = start or self.start
def predict(nonterm, i):
assert not is_terminal(nonterm), nonterm
return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]
assert not isinstance(nonterm, Terminal), nonterm
return [Item(rule, 0, i, None) for rule in self.predictions[nonterm]]
def complete(item):
name = item.rule.origin
item.data = self.postprocess[item.rule](item.data)
return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
def process_column(i, token, cur_set):
next_set = Column()
next_set = Column(i)
while True:
to_predict = {x.expect for x in cur_set.to_predict.get_news()
@@ -147,21 +162,20 @@ class Parser:
for item in to_reduce:
cur_set.add( complete(item) )
if token is not END_TOKEN:
for item in cur_set.to_scan.get_news():
match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
if match:
to_scan = cur_set.to_scan.get_news()
for item in to_scan:
if item.expect.match(token):
next_set.add([item.advance(stream[i])])
if not next_set and token is not END_TOKEN:
expect = {i.expect[-1] for i in cur_set.to_scan}
expect = {i.expect for i in cur_set.to_scan}
raise UnexpectedToken(token, expect, stream, i)
return cur_set, next_set
# Main loop starts
column0 = Column()
column0 = Column(0)
column0.add(predict(start, column0))
cur_set = column0
@@ -171,10 +185,83 @@ class Parser:
last_set, _ = process_column(len(stream), END_TOKEN, cur_set)
# Parse ended. Now build a parse tree
solutions = [n.data for n in last_set.to_reduce
solutions = [n.tree for n in last_set.to_reduce
if n.rule.origin==start and n.start is column0]
if not solutions:
raise ParseError('Incomplete parse: Could not find a solution to input')
return solutions
elif len(solutions) == 1:
tree = solutions[0]
else:
tree = Tree('ambig', solutions)
ResolveAmbig().visit(tree)
return ApplyCallbacks(self.postprocess).transform(tree)
class ApplyCallbacks(Transformer_NoRecurse):
def __init__(self, postprocess):
self.postprocess = postprocess
def drv(self, tree):
children = tree.children
callback = self.postprocess[tree.rule]
if callback:
return callback(children)
else:
return Tree(tree.rule.origin, children)
def _compare_rules(rule1, rule2):
assert rule1.origin == rule2.origin
c = compare( len(rule1.expansion), len(rule2.expansion))
if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here
return c
else:
return -c
def _compare_drv(tree1, tree2):
if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
return compare(tree1, tree2)
c = _compare_rules(tree1.rule, tree2.rule)
if c:
return c
# rules are "equal", so compare trees
for t1, t2 in zip(tree1.children, tree2.children):
c = _compare_drv(t1, t2)
if c:
return c
return compare(len(tree1.children), len(tree2.children))
class ResolveAmbig(Visitor_NoRecurse):
def ambig(self, tree):
best = max(tree.children, key=cmp_to_key(_compare_drv))
assert best.data == 'drv'
tree.set('drv', best.children)
tree.rule = best.rule # needed for applying callbacks
# RULES = [
# ('a', ['d']),
# ('d', ['b']),
# ('b', ['C']),
# ('b', ['b', 'C']),
# ('b', ['C', 'b']),
# ]
# p = Parser(RULES, 'a')
# for x in p.parse('CC'):
# print x.pretty()
#---------------
# RULES = [
# ('s', ['a', 'a']),
# ('a', ['b', 'b']),
# ('b', ['C'], lambda (x,): x),
# ('b', ['b', 'C']),
# ]
# p = Parser(RULES, 's', {})
# print p.parse('CCCCC').pretty()
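ResolveAmbig above picks the "best" derivation by feeding a three-way comparator to max() through functools.cmp_to_key. A minimal illustration of that pattern, independent of lark:

from functools import cmp_to_key

def cmp_len(a, b):
    # Three-way comparator, like _compare_drv: negative, zero, or positive.
    return (len(a) > len(b)) - (len(a) < len(b))

assert max(['aa', 'aaa', 'a'], key=cmp_to_key(cmp_len)) == 'aaa'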
@@ -13,7 +13,7 @@ class Rule(object):
self.alias = alias
def __repr__(self):
return '<%s : %s>' % (self.origin, ' '.join(map(unicode,self.expansion)))
return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
class RulePtr(object):
def __init__(self, rule, index):
@@ -0,0 +1,180 @@
"This module implements an Earley Parser"
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
from ..common import ParseError, UnexpectedToken, is_terminal
from .grammar_analysis import GrammarAnalyzer
class EndToken:
type = '$end'
END_TOKEN = EndToken()
class Item(object):
def __init__(self, rule, ptr, start, data):
self.rule = rule
self.ptr = ptr
self.start = start
self.data = data
@property
def expect(self):
return self.rule.expansion[self.ptr]
@property
def is_complete(self):
return self.ptr == len(self.rule.expansion)
def advance(self, data):
return Item(self.rule, self.ptr+1, self.start, self.data + [data])
def __eq__(self, other):
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
def __hash__(self):
return hash((self.rule, self.ptr, id(self.start)))
def __repr__(self):
before = map(str, self.rule.expansion[:self.ptr])
after = map(str, self.rule.expansion[self.ptr:])
return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
class NewsList(list):
"Keeps track of newly added items (append-only)"
def __init__(self, initial=None):
list.__init__(self, initial or [])
self.last_iter = 0
def get_news(self):
i = self.last_iter
self.last_iter = len(self)
return self[i:]
class Column:
"An entry in the table, aka Earley Chart"
def __init__(self):
self.to_reduce = NewsList()
self.to_predict = NewsList()
self.to_scan = NewsList()
self.item_count = 0
self.added = set()
def add(self, items):
"""Sort items into scan/predict/reduce newslists
Makes sure only unique items are added.
"""
added = self.added
for item in items:
if item.is_complete:
# (We must allow repetition of empty rules)
# if item.rule.expansion:
# This is an important test to avoid infinite-loops,
# For example for the rule:
# a: a | "b"
# If we can detect these cases statically, we can remove
# this test and gain a tiny performance boost
#
# if item in added:
# continue
# added.add(item)
self.to_reduce.append(item)
else:
if is_terminal(item.expect):
self.to_scan.append(item)
else:
if item in added:
continue
added.add(item)
self.to_predict.append(item)
self.item_count += 1 # Only count if actually added
def __nonzero__(self):
return bool(self.item_count)
class Parser:
def __init__(self, parser_conf):
self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
self.start = parser_conf.start
self.postprocess = {}
self.predictions = {}
for rule in self.analysis.rules:
if rule.origin != '$root': # XXX kinda ugly
a = rule.alias
self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
def parse(self, stream, start=None):
# Define parser functions
start = start or self.start
def predict(nonterm, i):
assert not is_terminal(nonterm), nonterm
return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]
def complete(item):
name = item.rule.origin
item.data = self.postprocess[item.rule](item.data)
return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]
def process_column(i, token, cur_set):
next_set = Column()
while True:
to_predict = {x.expect for x in cur_set.to_predict.get_news()
if x.ptr} # if not part of an already predicted batch
to_reduce = cur_set.to_reduce.get_news()
if not (to_predict or to_reduce):
break
for nonterm in to_predict:
cur_set.add( predict(nonterm, cur_set) )
for item in to_reduce:
cur_set.add( complete(item) )
if token is not END_TOKEN:
for item in cur_set.to_scan.get_news():
match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
if match:
next_set.add([item.advance(stream[i])])
if not next_set and token is not END_TOKEN:
expect = {i.expect[-1] for i in cur_set.to_scan}
raise UnexpectedToken(token, expect, stream, i)
return cur_set, next_set
# Main loop starts
column0 = Column()
column0.add(predict(start, column0))
cur_set = column0
for i, char in enumerate(stream):
_, cur_set = process_column(i, char, cur_set)
last_set, _ = process_column(len(stream), END_TOKEN, cur_set)
# Parse ended. Now build a parse tree
solutions = [n.data for n in last_set.to_reduce
if n.rule.origin==start and n.start is column0]
if not solutions:
raise ParseError('Incomplete parse: Could not find a solution to input')
return solutions
@@ -2,7 +2,7 @@ import re
from collections import defaultdict
from .tree import Tree
from .common import is_terminal, ParserConf, PatternStr
from .common import is_terminal, ParserConf, PatternStr, Terminal
from .lexer import Token
from .parsers import earley
@@ -26,21 +26,14 @@ class Reconstructor:
token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in _tokens}
class MatchData(object):
def __init__(self, data):
self.data = data
def __repr__(self):
return '%s(%r)' % (type(self).__name__, self.data)
class MatchTerminal(MatchData):
def __call__(self, other):
class MatchTerminal(Terminal):
def match(self, other):
if isinstance(other, Tree):
return False
return token_res[self.data].match(other) is not None
class MatchTree(MatchData):
def __call__(self, other):
class MatchTree(Terminal):
def match(self, other):
try:
return self.data == other.data
except AttributeError:
@@ -90,7 +83,7 @@ class Reconstructor:
for name, expansions in d.items():
for expansion in expansions:
reduced = [sym if sym.startswith('_') or sym in expand1s else
(MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym),)
MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym)
for sym in expansion if not is_discarded_terminal(sym)]
rules.append((name, reduced, WriteTokens(name, expansion).f))
@@ -98,9 +91,9 @@ class Reconstructor:
def _reconstruct(self, tree):
parser = earley.Parser(ParserConf(self.rules, {}, tree.data))
res ,= parser.parse(tree.children) # XXX ambiguity?
# TODO: ambiguity?
parser = earley.Parser(self.rules, tree.data, {})
res = parser.parse(tree.children)
for item in res:
if isinstance(item, Tree):
for x in self._reconstruct(item):
@@ -32,7 +32,11 @@ class Tree(object):
self.children[i:i+1] = kid.children
def __eq__(self, other):
return self.data == other.data and self.children == other.children
try:
return self.data == other.data and self.children == other.children
except AttributeError:
return False
def __hash__(self):
return hash((self.data, tuple(self.children)))
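The guard above makes Tree equality total: comparing a Tree to anything without .data/.children now returns False instead of raising. A quick sketch (import path assumed from this era's layout):

from lark.tree import Tree

t = Tree('start', [Tree('a', []), 'tok'])
assert t == Tree('start', [Tree('a', []), 'tok'])
assert t != 'not a tree'   # AttributeError is swallowed -> unequal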
@@ -57,10 +61,24 @@ class Tree(object):
if pred(c):
yield c
def iter_subtrees(self):
q = [self]
while q:
subtree = q.pop()
yield subtree
q += [c for c in subtree.children if isinstance(c, Tree)]
def __deepcopy__(self, memo):
return type(self)(self.data, deepcopy(self.children, memo))
def copy(self):
return type(self)(self.data, self.children)
def set(self, data, children):
self.data = data
self.children = children
class Transformer(object):
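iter_subtrees above walks the tree with an explicit stack, yielding each parent before any of its children; that ordering is why the NoRecurse visitors below iterate the list in reverse, so children are handled first. A small check, assuming exactly the implementation shown:

from lark.tree import Tree

t = Tree('a', [Tree('b', []), Tree('c', [Tree('d', [])])])
assert [s.data for s in t.iter_subtrees()] == ['a', 'c', 'd', 'b']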
@@ -81,7 +99,7 @@ class Transformer(object):
class InlineTransformer(Transformer):
def _get_func(self, name):
def _get_func(self, name): # use super()._get_func
return inline_args(getattr(self, name)).__get__(self)
@@ -97,3 +115,35 @@ class Visitor(object):
def __default__(self, tree):
pass
class Visitor_NoRecurse(Visitor):
def visit(self, tree):
subtrees = list(tree.iter_subtrees())
for subtree in reversed(subtrees):
getattr(self, subtree.data, self.__default__)(subtree)
return tree
class Transformer_NoRecurse(Transformer):
def transform(self, tree):
subtrees = list(tree.iter_subtrees())
def _t(t):
# Assumes t is already transformed
try:
f = self._get_func(t.data)
except AttributeError:
return self.__default__(t)
else:
return f(t)
for subtree in reversed(subtrees):
subtree.children = [_t(c) if isinstance(c, Tree) else c for c in subtree.children]
return _t(tree)
def __default__(self, t):
return t
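As the module comment in earley.py says, these classes exist because a derivation tree can be deeper than Python's recursion limit. A hedged sketch (assuming the lark.tree layout above) showing Visitor_NoRecurse surviving a tree that would blow up a recursive visit:

import sys
from lark.tree import Tree, Visitor_NoRecurse

deep = Tree('leaf', [])
for _ in range(sys.getrecursionlimit() * 2):
    deep = Tree('node', [deep])

class CountNodes(Visitor_NoRecurse):
    n = 0
    def node(self, tree):
        CountNodes.n += 1

CountNodes().visit(deep)   # no RecursionError: iteration, not recursion
assert CountNodes.n == sys.getrecursionlimit() * 2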
@@ -69,3 +69,14 @@ def inline_args(f):
return f.__func__(self, *args)
return _f
try:
compare = cmp
except NameError:
def compare(a, b):
if a == b:
return 0
elif a > b:
return 1
else:
return -1
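This shim keeps the Python 2 cmp contract available on Python 3, where the builtin was removed; earley.py's comparators rely on it via the "from ..utils import compare" line above. Its contract in one line (lark.utils assumed as the module's home, per that import):

from lark.utils import compare

assert compare(1, 2) == -1 and compare(2, 2) == 0 and compare(3, 2) == 1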
@@ -73,6 +73,28 @@ class TestEarley(unittest.TestCase):
l = Lark(grammar, parser='earley', lexer=None)
l.parse(program)
def test_earley_scanless3(self):
"Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
grammar = """
start: A A
A: "a"+
"""
l = Lark(grammar, parser='earley', lexer=None)
res = l.parse("aaa")
self.assertEqual(res.children, ['aa', 'a'])
def test_earley_scanless4(self):
grammar = """
start: A A?
A: "a"+
"""
l = Lark(grammar, parser='earley', lexer=None)
res = l.parse("aaa")
self.assertEqual(res.children, ['aaa'])
def _make_parser_test(LEXER, PARSER):
def _Lark(grammar, **kwargs):
return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)