It now knows how to resolve ambiguity! And in a memory-efficient way!
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
| @@ -134,7 +134,7 @@ These features may be implemented some day: | |||||
| - You can work with parse-trees instead of state-machines | - You can work with parse-trees instead of state-machines | ||||
| - The grammar is simple to read and write | - The grammar is simple to read and write | ||||
| - There are no restrictions on grammar structure. Any grammar you write can be parsed. | - There are no restrictions on grammar structure. Any grammar you write can be parsed. | ||||
| - Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working. | |||||
| - Some structures are faster than others. If you care about speed, you can learn them gradually while the parser is already working | |||||
| - A well-written grammar is very fast | - A well-written grammar is very fast | ||||
| - Note: Nondeterministic grammars will run a little slower | - Note: Nondeterministic grammars will run a little slower | ||||
| - Note: Ambiguous grammars (grammars that can be parsed in more than one way) are supported, but may cause significant slowdown if the ambiguity is too big | - Note: Ambiguous grammars (grammars that can be parsed in more than one way) are supported, but may cause significant slowdown if the ambiguity is too big | ||||
| @@ -25,7 +25,7 @@ parser = Lark(r""" | |||||
| %import common.WS_INLINE | %import common.WS_INLINE | ||||
| %ignore WS_INLINE | %ignore WS_INLINE | ||||
| """) | |||||
| """, lexer=None) | |||||
| def test(): | def test(): | ||||
| sample_conf = """ | sample_conf = """ | ||||
| @@ -29,7 +29,7 @@ class UnexpectedToken(ParseError): | |||||
| def is_terminal(sym): | def is_terminal(sym): | ||||
| return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$' | |||||
| return isinstance(sym, Terminal) or sym.isupper() or sym[0] == '$' | |||||
| class LexerConf: | class LexerConf: | ||||
| @@ -81,3 +81,26 @@ class TokenDef(object): | |||||
| def __repr__(self): | def __repr__(self): | ||||
| return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | ||||
class Terminal(object):
    """A grammar symbol that is matched directly against the input.

    ``data`` is the value the terminal was built from.  Subclasses supply a
    ``match`` attribute/method that decides whether an input element fits.

    The class is new-style on purpose: with an old-style class on Python 2,
    ``type(self)`` is the generic instance type, so the ``isinstance`` check
    in ``__eq__`` would accept instances of unrelated classes.
    """

    def __init__(self, data):
        self.data = data

    def __repr__(self):
        # Wrap in a 1-tuple so that tuple-valued data is formatted as a whole
        # instead of being unpacked as %-format arguments.
        return '%r' % (self.data,)

    def __eq__(self, other):
        # Equal only to an instance of the same class (or a subclass of it)
        # carrying equal data.
        return isinstance(other, type(self)) and self.data == other.data

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__; keep the two consistent.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.data)


class Terminal_Regexp(Terminal):
    """Terminal whose ``data`` is a regular expression source string.

    ``match`` is the bound ``match`` method of the compiled pattern, so it
    returns a match object or ``None``.
    """

    def __init__(self, data):
        Terminal.__init__(self, data)
        self.match = re.compile(data).match


class Terminal_Token(Terminal):
    """Terminal whose ``data`` is a token type name."""

    def match(self, other):
        """Return True when ``other.type`` equals this terminal's data."""
        return self.data == other.type
| @@ -159,6 +159,8 @@ class Lark: | |||||
| def lex(self, text): | def lex(self, text): | ||||
| if not hasattr(self, 'lexer'): | |||||
| self.lexer = self._build_lexer() | |||||
| stream = self.lexer.lex(text) | stream = self.lexer.lex(text) | ||||
| if self.options.postlex: | if self.options.postlex: | ||||
| return self.options.postlex.process(stream) | return self.options.postlex.process(stream) | ||||
| @@ -67,8 +67,8 @@ TOKENS = { | |||||
| '_DOT': r'\.', | '_DOT': r'\.', | ||||
| 'RULE': '!?[_?]?[a-z][_a-z0-9]*', | 'RULE': '!?[_?]?[a-z][_a-z0-9]*', | ||||
| 'TOKEN': '_?[A-Z][_A-Z0-9]*', | 'TOKEN': '_?[A-Z][_A-Z0-9]*', | ||||
| 'STRING': r'"(\\"|\\\\|[^"])*?"', | |||||
| 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/', | |||||
| 'STRING': r'"(\\"|\\\\|[^"\n])*?"', | |||||
| 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/', | |||||
| '_NL': r'(\r?\n)+\s*', | '_NL': r'(\r?\n)+\s*', | ||||
| 'WS': r'[ \t]+', | 'WS': r'[ \t]+', | ||||
| 'COMMENT': r'//[^\n]*', | 'COMMENT': r'//[^\n]*', | ||||
| @@ -377,11 +377,15 @@ class Grammar: | |||||
| else: | else: | ||||
| options = RuleOptions.new_from(options, create_token=name) | options = RuleOptions.new_from(options, create_token=name) | ||||
| name = tokens_to_convert[name] | name = tokens_to_convert[name] | ||||
| inner = Token('RULE', name + '_inner') | |||||
| new_rule_defs.append((name, T('expansions', [T('expansion', [inner])]), None)) | |||||
| name = inner | |||||
| for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ): | |||||
| for i, sym in enumerate(exp.children): | |||||
| if sym in tokens_to_convert: | |||||
| exp.children[i] = Token(sym.type, tokens_to_convert[sym]) | |||||
| else: | |||||
| for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ): | |||||
| for i, sym in enumerate(exp.children): | |||||
| if sym in tokens_to_convert: | |||||
| exp.children[i] = Token(sym.type, tokens_to_convert[sym]) | |||||
| new_rule_defs.append((name, tree, options)) | new_rule_defs.append((name, tree, options)) | ||||
| @@ -3,8 +3,8 @@ import sre_parse | |||||
| from .lexer import Lexer, ContextualLexer, Token | from .lexer import Lexer, ContextualLexer, Token | ||||
| from .common import is_terminal, GrammarError, ParserConf | |||||
| from .parsers import lalr_parser, earley, nearley | |||||
| from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token | |||||
| from .parsers import lalr_parser, old_earley, nearley, earley | |||||
| from .tree import Transformer | from .tree import Transformer | ||||
| class WithLexer: | class WithLexer: | ||||
| @@ -70,13 +70,13 @@ class Nearley(WithLexer): | |||||
| return res[0] | return res[0] | ||||
| class Earley(WithLexer): | |||||
| class OldEarley(WithLexer): | |||||
| def __init__(self, lexer_conf, parser_conf): | def __init__(self, lexer_conf, parser_conf): | ||||
| WithLexer.__init__(self, lexer_conf) | WithLexer.__init__(self, lexer_conf) | ||||
| rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules] | rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules] | ||||
| self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) | |||||
| self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) | |||||
| def _prepare_expansion(self, expansion): | def _prepare_expansion(self, expansion): | ||||
| return [(sym,) if is_terminal(sym) else sym for sym in expansion] | return [(sym,) if is_terminal(sym) else sym for sym in expansion] | ||||
| @@ -100,13 +100,13 @@ def tokenize_text(text): | |||||
| return new_text | return new_text | ||||
| class Earley_NoLex: | |||||
| class OldEarley_NoLex: | |||||
| def __init__(self, lexer_conf, parser_conf): | def __init__(self, lexer_conf, parser_conf): | ||||
| self.token_by_name = {t.name:t for t in lexer_conf.tokens} | self.token_by_name = {t.name:t for t in lexer_conf.tokens} | ||||
| rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] | rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] | ||||
| self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) | |||||
| self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start)) | |||||
| def _prepare_expansion(self, expansion): | def _prepare_expansion(self, expansion): | ||||
| for sym in expansion: | for sym in expansion: | ||||
| @@ -125,6 +125,43 @@ class Earley_NoLex: | |||||
| assert len(res) ==1 , 'Ambiguious Parse! Not handled yet' | assert len(res) ==1 , 'Ambiguious Parse! Not handled yet' | ||||
| return res[0] | return res[0] | ||||
class Earley_NoLex:
    """Scanless Earley front-end: the grammar's terminals are matched against
    the elements produced by ``tokenize_text`` instead of lexer tokens."""

    def __init__(self, lexer_conf, parser_conf):
        self.token_by_name = dict((tok.name, tok) for tok in lexer_conf.tokens)

        rules = []
        for origin, expansion, alias in parser_conf.rules:
            rules.append((origin, list(self._prepare_expansion(expansion)), alias))

        self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)

    def _prepare_expansion(self, expansion):
        """Yield the expansion's symbols, replacing each terminal with a
        ``Terminal_Regexp`` built from its token pattern.

        Raises GrammarError when a terminal's regexp width is not (1, 1).
        """
        for sym in expansion:
            if not is_terminal(sym):
                # Non-terminals pass through untouched.
                yield sym
                continue

            regexp = self.token_by_name[sym].pattern.to_regexp()
            width = sre_parse.parse(regexp).getwidth()
            if width != (1,1):
                raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
            yield Terminal_Regexp(regexp)

    def parse(self, text):
        """Run the Earley parser over ``tokenize_text(text)``."""
        return self.parser.parse(tokenize_text(text))
class Earley(WithLexer):
    """Earley front-end that parses the token stream produced by the lexer."""

    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)

        rules = [(origin, self._prepare_expansion(expansion), alias)
                 for origin, expansion, alias in parser_conf.rules]

        self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback)

    def _prepare_expansion(self, expansion):
        """Return the expansion as a list, with every terminal symbol wrapped
        in a ``Terminal_Token``."""
        prepared = []
        for sym in expansion:
            prepared.append(Terminal_Token(sym) if is_terminal(sym) else sym)
        return prepared

    def parse(self, text):
        """Lex ``text`` fully into a list, then hand it to the parser."""
        return self.parser.parse(list(self.lex(text)))
| def get_frontend(parser, lexer): | def get_frontend(parser, lexer): | ||||
| if parser=='lalr': | if parser=='lalr': | ||||
| @@ -1,25 +1,42 @@ | |||||
| "This module implements an Earley Parser" | "This module implements an Earley Parser" | ||||
| # The parser uses a parse-forest to keep track of derivations and ambiguities. | |||||
| # When the parse ends successfully, a disambiguation stage resolves all ambiguity | |||||
| # (right now ambiguity resolution is not developed beyond the needs of lark) | |||||
| # Afterwards the parse tree is reduced (transformed) according to user callbacks. | |||||
| # I use the no-recursion version of Transformer and Visitor, because the tree might be | |||||
| # deeper than Python's recursion limit (a bit absurd, but that's life) | |||||
| # | |||||
| # The algorithm keeps track of each state set, using a corresponding Column instance. | # The algorithm keeps track of each state set, using a corresponding Column instance. | ||||
| # Column keeps track of new items using NewsList instances. | # Column keeps track of new items using NewsList instances. | ||||
| # | # | ||||
| # Author: Erez Shinan (2017) | # Author: Erez Shinan (2017) | ||||
| # Email : erezshin@gmail.com | # Email : erezshin@gmail.com | ||||
| from ..common import ParseError, UnexpectedToken, is_terminal | |||||
| from functools import cmp_to_key | |||||
| from ..utils import compare | |||||
| from ..common import ParseError, UnexpectedToken, Terminal | |||||
| from .grammar_analysis import GrammarAnalyzer | from .grammar_analysis import GrammarAnalyzer | ||||
| from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse | |||||
| class EndToken: | class EndToken: | ||||
| type = '$end' | type = '$end' | ||||
class Derivation(Tree):
    """A ``Tree`` whose data is always 'drv' and which remembers the rule it
    was derived from."""

    def __init__(self, rule, items=None):
        children = items or []
        Tree.__init__(self, 'drv', children)
        self.rule = rule
| END_TOKEN = EndToken() | END_TOKEN = EndToken() | ||||
| class Item(object): | class Item(object): | ||||
| def __init__(self, rule, ptr, start, data): | |||||
| def __init__(self, rule, ptr, start, tree): | |||||
| self.rule = rule | self.rule = rule | ||||
| self.ptr = ptr | self.ptr = ptr | ||||
| self.start = start | self.start = start | ||||
| self.data = data | |||||
| self.tree = tree if tree is not None else Derivation(self.rule) | |||||
| @property | @property | ||||
| def expect(self): | def expect(self): | ||||
| @@ -29,8 +46,10 @@ class Item(object): | |||||
| def is_complete(self): | def is_complete(self): | ||||
| return self.ptr == len(self.rule.expansion) | return self.ptr == len(self.rule.expansion) | ||||
| def advance(self, data): | |||||
| return Item(self.rule, self.ptr+1, self.start, self.data + [data]) | |||||
| def advance(self, tree): | |||||
| assert self.tree.data == 'drv' | |||||
| new_tree = Derivation(self.rule, self.tree.children + [tree]) | |||||
| return Item(self.rule, self.ptr+1, self.start, new_tree) | |||||
| def __eq__(self, other): | def __eq__(self, other): | ||||
| return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | ||||
| @@ -38,8 +57,8 @@ class Item(object): | |||||
| return hash((self.rule, self.ptr, id(self.start))) | return hash((self.rule, self.ptr, id(self.start))) | ||||
| def __repr__(self): | def __repr__(self): | ||||
| before = map(str, self.rule.expansion[:self.ptr]) | |||||
| after = map(str, self.rule.expansion[self.ptr:]) | |||||
| before = list(map(str, self.rule.expansion[:self.ptr])) | |||||
| after = list(map(str, self.rule.expansion[self.ptr:])) | |||||
| return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) | return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) | ||||
| @@ -56,15 +75,18 @@ class NewsList(list): | |||||
| return self[i:] | return self[i:] | ||||
| class Column: | class Column: | ||||
| "An entry in the table, aka Earley Chart" | "An entry in the table, aka Earley Chart" | ||||
| def __init__(self): | |||||
| def __init__(self, i): | |||||
| self.i = i | |||||
| self.to_reduce = NewsList() | self.to_reduce = NewsList() | ||||
| self.to_predict = NewsList() | self.to_predict = NewsList() | ||||
| self.to_scan = NewsList() | self.to_scan = NewsList() | ||||
| self.item_count = 0 | self.item_count = 0 | ||||
| self.added = set() | self.added = set() | ||||
| self.completed = {} | |||||
| def add(self, items): | def add(self, items): | ||||
| """Sort items into scan/predict/reduce newslists | """Sort items into scan/predict/reduce newslists | ||||
| @@ -76,29 +98,24 @@ class Column: | |||||
| for item in items: | for item in items: | ||||
| if item.is_complete: | if item.is_complete: | ||||
| # (We must allow repetition of empty rules) | |||||
| if item.rule.expansion: | |||||
| # This is an important test to avoid infinite-loops, | |||||
| # For example for the rule: | |||||
| # a: a | "b" | |||||
| # If we can detect these cases statically, we can remove | |||||
| # this test an gain a tiny performance boost | |||||
| # | |||||
| if item in added: | |||||
| continue | |||||
| added.add(item) | |||||
| self.to_reduce.append(item) | |||||
| else: | |||||
| if is_terminal(item.expect): | |||||
| self.to_scan.append(item) | |||||
| # XXX TODO Potential bug: What happens if there's ambiguity in an empty rule? | |||||
| if item.rule.expansion and item in self.completed: | |||||
| old_tree = self.completed[item].tree | |||||
| if old_tree.data != 'ambig': | |||||
| new_tree = old_tree.copy() | |||||
| new_tree.rule = old_tree.rule | |||||
| old_tree.set('ambig', [new_tree]) | |||||
| old_tree.children.append(item.tree) | |||||
| else: | else: | ||||
| if item in added: | |||||
| continue | |||||
| self.completed[item] = item | |||||
| self.to_reduce.append(item) | |||||
| else: | |||||
| if item not in added: | |||||
| added.add(item) | added.add(item) | ||||
| self.to_predict.append(item) | |||||
| if isinstance(item.expect, Terminal): | |||||
| self.to_scan.append(item) | |||||
| else: | |||||
| self.to_predict.append(item) | |||||
| self.item_count += 1 # Only count if actually added | self.item_count += 1 # Only count if actually added | ||||
| @@ -106,17 +123,16 @@ class Column: | |||||
| return bool(self.item_count) | return bool(self.item_count) | ||||
| class Parser: | class Parser: | ||||
| def __init__(self, parser_conf): | |||||
| self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start) | |||||
| self.start = parser_conf.start | |||||
| def __init__(self, rules, start, callback): | |||||
| self.analysis = GrammarAnalyzer(rules, start) | |||||
| self.start = start | |||||
| self.postprocess = {} | self.postprocess = {} | ||||
| self.predictions = {} | self.predictions = {} | ||||
| for rule in self.analysis.rules: | for rule in self.analysis.rules: | ||||
| if rule.origin != '$root': # XXX kinda ugly | if rule.origin != '$root': # XXX kinda ugly | ||||
| a = rule.alias | a = rule.alias | ||||
| self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a) | |||||
| self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) | |||||
| self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | ||||
| def parse(self, stream, start=None): | def parse(self, stream, start=None): | ||||
| @@ -124,16 +140,15 @@ class Parser: | |||||
| start = start or self.start | start = start or self.start | ||||
| def predict(nonterm, i): | def predict(nonterm, i): | ||||
| assert not is_terminal(nonterm), nonterm | |||||
| return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]] | |||||
| assert not isinstance(nonterm, Terminal), nonterm | |||||
| return [Item(rule, 0, i, None) for rule in self.predictions[nonterm]] | |||||
| def complete(item): | def complete(item): | ||||
| name = item.rule.origin | name = item.rule.origin | ||||
| item.data = self.postprocess[item.rule](item.data) | |||||
| return [i.advance(item.data) for i in item.start.to_predict if i.expect == name] | |||||
| return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] | |||||
| def process_column(i, token, cur_set): | def process_column(i, token, cur_set): | ||||
| next_set = Column() | |||||
| next_set = Column(i) | |||||
| while True: | while True: | ||||
| to_predict = {x.expect for x in cur_set.to_predict.get_news() | to_predict = {x.expect for x in cur_set.to_predict.get_news() | ||||
| @@ -147,21 +162,20 @@ class Parser: | |||||
| for item in to_reduce: | for item in to_reduce: | ||||
| cur_set.add( complete(item) ) | cur_set.add( complete(item) ) | ||||
| if token is not END_TOKEN: | if token is not END_TOKEN: | ||||
| for item in cur_set.to_scan.get_news(): | |||||
| match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type | |||||
| if match: | |||||
| to_scan = cur_set.to_scan.get_news() | |||||
| for item in to_scan: | |||||
| if item.expect.match(token): | |||||
| next_set.add([item.advance(stream[i])]) | next_set.add([item.advance(stream[i])]) | ||||
| if not next_set and token is not END_TOKEN: | if not next_set and token is not END_TOKEN: | ||||
| expect = {i.expect[-1] for i in cur_set.to_scan} | |||||
| expect = {i.expect for i in cur_set.to_scan} | |||||
| raise UnexpectedToken(token, expect, stream, i) | raise UnexpectedToken(token, expect, stream, i) | ||||
| return cur_set, next_set | return cur_set, next_set | ||||
| # Main loop starts | # Main loop starts | ||||
| column0 = Column() | |||||
| column0 = Column(0) | |||||
| column0.add(predict(start, column0)) | column0.add(predict(start, column0)) | ||||
| cur_set = column0 | cur_set = column0 | ||||
| @@ -171,10 +185,83 @@ class Parser: | |||||
| last_set, _ = process_column(len(stream), END_TOKEN, cur_set) | last_set, _ = process_column(len(stream), END_TOKEN, cur_set) | ||||
| # Parse ended. Now build a parse tree | # Parse ended. Now build a parse tree | ||||
| solutions = [n.data for n in last_set.to_reduce | |||||
| solutions = [n.tree for n in last_set.to_reduce | |||||
| if n.rule.origin==start and n.start is column0] | if n.rule.origin==start and n.start is column0] | ||||
| if not solutions: | if not solutions: | ||||
| raise ParseError('Incomplete parse: Could not find a solution to input') | raise ParseError('Incomplete parse: Could not find a solution to input') | ||||
| return solutions | |||||
| elif len(solutions) == 1: | |||||
| tree = solutions[0] | |||||
| else: | |||||
| tree = Tree('ambig', solutions) | |||||
| ResolveAmbig().visit(tree) | |||||
| return ApplyCallbacks(self.postprocess).transform(tree) | |||||
class ApplyCallbacks(Transformer_NoRecurse):
    """Transformer that replaces every 'drv' node with the result of the
    callback registered for the node's rule.

    ``postprocess`` maps a rule to its callback; a falsy entry means the rule
    has no callback.
    """

    def __init__(self, postprocess):
        self.postprocess = postprocess

    def drv(self, tree):
        """Apply the callback for ``tree.rule`` to the node's children.

        When the rule has no callback, return a plain ``Tree`` named after
        the rule's origin instead.
        """
        children = tree.children
        callback = self.postprocess[tree.rule]
        if callback:
            return callback(children)
        # The rule is only reachable through the node being transformed;
        # there is no bare `rule` name in this scope.
        return Tree(tree.rule.origin, children)
def _compare_rules(rule1, rule2):
    """Three-way comparison of two rules that share the same origin.

    The rules are compared by expansion length. For origins starting with
    '__' that result is returned as-is; for every other origin it is negated.
    """
    assert rule1.origin == rule2.origin
    by_length = compare(len(rule1.expansion), len(rule2.expansion))
    # XXX hack! We need to set priority in parser, not here
    return by_length if rule1.origin.startswith('__') else -by_length
def _compare_drv(tree1, tree2):
    """Three-way comparison of two derivations.

    If either argument is not a ``Tree`` the two are compared directly.
    Otherwise the rules decide first, then the children pairwise (recursively),
    and finally the number of children.
    """
    both_are_trees = isinstance(tree1, Tree) and isinstance(tree2, Tree)
    if not both_are_trees:
        return compare(tree1, tree2)

    result = _compare_rules(tree1.rule, tree2.rule)
    if result:
        return result

    # rules are "equal", so compare trees
    for left, right in zip(tree1.children, tree2.children):
        result = _compare_drv(left, right)
        if result:
            return result

    return compare(len(tree1.children), len(tree2.children))
class ResolveAmbig(Visitor_NoRecurse):
    """Visitor that collapses each 'ambig' node into the child that ranks
    highest under ``_compare_drv``."""

    def ambig(self, tree):
        rank = cmp_to_key(_compare_drv)
        winner = max(tree.children, key=rank)
        assert winner.data == 'drv'
        # Rewrite the node in place so existing references to it stay valid.
        tree.set('drv', winner.children)
        tree.rule = winner.rule    # needed for applying callbacks
| # RULES = [ | |||||
| # ('a', ['d']), | |||||
| # ('d', ['b']), | |||||
| # ('b', ['C']), | |||||
| # ('b', ['b', 'C']), | |||||
| # ('b', ['C', 'b']), | |||||
| # ] | |||||
| # p = Parser(RULES, 'a') | |||||
| # for x in p.parse('CC'): | |||||
| # print x.pretty() | |||||
| #--------------- | |||||
| # RULES = [ | |||||
| # ('s', ['a', 'a']), | |||||
| # ('a', ['b', 'b']), | |||||
| # ('b', ['C'], lambda (x,): x), | |||||
| # ('b', ['b', 'C']), | |||||
| # ] | |||||
| # p = Parser(RULES, 's', {}) | |||||
| # print p.parse('CCCCC').pretty() | |||||
| @@ -13,7 +13,7 @@ class Rule(object): | |||||
| self.alias = alias | self.alias = alias | ||||
| def __repr__(self): | def __repr__(self): | ||||
| return '<%s : %s>' % (self.origin, ' '.join(map(unicode,self.expansion))) | |||||
| return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||||
| class RulePtr(object): | class RulePtr(object): | ||||
| def __init__(self, rule, index): | def __init__(self, rule, index): | ||||
| @@ -0,0 +1,180 @@ | |||||
| "This module implements an Earley Parser" | |||||
| # The algorithm keeps track of each state set, using a corresponding Column instance. | |||||
| # Column keeps track of new items using NewsList instances. | |||||
| # | |||||
| # Author: Erez Shinan (2017) | |||||
| # Email : erezshin@gmail.com | |||||
| from ..common import ParseError, UnexpectedToken, is_terminal | |||||
| from .grammar_analysis import GrammarAnalyzer | |||||
| class EndToken: | |||||
| type = '$end' | |||||
| END_TOKEN = EndToken() | |||||
class Item(object):
    """A rule together with a position inside its expansion.

    ``ptr`` indexes into ``rule.expansion``; ``start`` is the object the item
    originated from (compared by identity); ``data`` is the list of values
    collected so far.
    """

    def __init__(self, rule, ptr, start, data):
        self.rule, self.ptr = rule, ptr
        self.start, self.data = start, data

    @property
    def expect(self):
        "The symbol at the current position of the expansion"
        return self.rule.expansion[self.ptr]

    @property
    def is_complete(self):
        "True once the position has reached the end of the expansion"
        return len(self.rule.expansion) == self.ptr

    def advance(self, data):
        "Return a new Item one position further, with ``data`` appended"
        collected = self.data + [data]
        return Item(self.rule, self.ptr + 1, self.start, collected)

    def __eq__(self, other):
        # The collected data is deliberately not part of an item's identity.
        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule

    def __hash__(self):
        return hash((self.rule, self.ptr, id(self.start)))

    def __repr__(self):
        expansion = self.rule.expansion
        before = ' '.join(str(sym) for sym in expansion[:self.ptr])
        after = ' '.join(str(sym) for sym in expansion[self.ptr:])
        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, before, after)
class NewsList(list):
    "Keeps track of newly added items (append-only)"

    def __init__(self, initial=None):
        super(NewsList, self).__init__(initial if initial else [])
        # Index of the first element not yet handed out by get_news().
        self.last_iter = 0

    def get_news(self):
        "Return the elements added since the previous call"
        seen, self.last_iter = self.last_iter, len(self)
        return self[seen:]
class Column:
    "An entry in the table, aka Earley Chart"

    def __init__(self):
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
        self.to_scan = NewsList()
        self.item_count = 0

        # Items already queued in to_predict (used for de-duplication).
        self.added = set()

    def add(self, items):
        """Sort items into scan/predict/reduce newslists

        Only items that expect a non-terminal are de-duplicated; completed
        items and items that expect a terminal are always appended.
        """
        added = self.added
        for item in items:

            if item.is_complete:
                # Completed items are intentionally not de-duplicated here.
                self.to_reduce.append(item)
            elif is_terminal(item.expect):
                self.to_scan.append(item)
            else:
                if item in added:
                    continue
                added.add(item)
                self.to_predict.append(item)

            self.item_count += 1    # Only count if actually added

    def __bool__(self):
        "A column is truthy once at least one item has been added to it"
        return bool(self.item_count)

    # Python 2 looks up __nonzero__; Python 3 looks up __bool__. Without the
    # latter every column is truthy on Python 3 and `not column` never fires.
    __nonzero__ = __bool__
| class Parser: | |||||
| def __init__(self, parser_conf): | |||||
| self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start) | |||||
| self.start = parser_conf.start | |||||
| self.postprocess = {} | |||||
| self.predictions = {} | |||||
| for rule in self.analysis.rules: | |||||
| if rule.origin != '$root': # XXX kinda ugly | |||||
| a = rule.alias | |||||
| self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a) | |||||
| self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||||
| def parse(self, stream, start=None): | |||||
| # Define parser functions | |||||
| start = start or self.start | |||||
| def predict(nonterm, i): | |||||
| assert not is_terminal(nonterm), nonterm | |||||
| return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]] | |||||
| def complete(item): | |||||
| name = item.rule.origin | |||||
| item.data = self.postprocess[item.rule](item.data) | |||||
| return [i.advance(item.data) for i in item.start.to_predict if i.expect == name] | |||||
| def process_column(i, token, cur_set): | |||||
| next_set = Column() | |||||
| while True: | |||||
| to_predict = {x.expect for x in cur_set.to_predict.get_news() | |||||
| if x.ptr} # if not part of an already predicted batch | |||||
| to_reduce = cur_set.to_reduce.get_news() | |||||
| if not (to_predict or to_reduce): | |||||
| break | |||||
| for nonterm in to_predict: | |||||
| cur_set.add( predict(nonterm, cur_set) ) | |||||
| for item in to_reduce: | |||||
| cur_set.add( complete(item) ) | |||||
| if token is not END_TOKEN: | |||||
| for item in cur_set.to_scan.get_news(): | |||||
| match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type | |||||
| if match: | |||||
| next_set.add([item.advance(stream[i])]) | |||||
| if not next_set and token is not END_TOKEN: | |||||
| expect = {i.expect[-1] for i in cur_set.to_scan} | |||||
| raise UnexpectedToken(token, expect, stream, i) | |||||
| return cur_set, next_set | |||||
| # Main loop starts | |||||
| column0 = Column() | |||||
| column0.add(predict(start, column0)) | |||||
| cur_set = column0 | |||||
| for i, char in enumerate(stream): | |||||
| _, cur_set = process_column(i, char, cur_set) | |||||
| last_set, _ = process_column(len(stream), END_TOKEN, cur_set) | |||||
| # Parse ended. Now build a parse tree | |||||
| solutions = [n.data for n in last_set.to_reduce | |||||
| if n.rule.origin==start and n.start is column0] | |||||
| if not solutions: | |||||
| raise ParseError('Incomplete parse: Could not find a solution to input') | |||||
| return solutions | |||||
| @@ -2,7 +2,7 @@ import re | |||||
| from collections import defaultdict | from collections import defaultdict | ||||
| from .tree import Tree | from .tree import Tree | ||||
| from .common import is_terminal, ParserConf, PatternStr | |||||
| from .common import is_terminal, ParserConf, PatternStr, Terminal | |||||
| from .lexer import Token | from .lexer import Token | ||||
| from .parsers import earley | from .parsers import earley | ||||
| @@ -26,21 +26,14 @@ class Reconstructor: | |||||
| token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in _tokens} | token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in _tokens} | ||||
| class MatchData(object): | |||||
| def __init__(self, data): | |||||
| self.data = data | |||||
| def __repr__(self): | |||||
| return '%s(%r)' % (type(self).__name__, self.data) | |||||
| class MatchTerminal(MatchData): | |||||
| def __call__(self, other): | |||||
| class MatchTerminal(Terminal): | |||||
| def match(self, other): | |||||
| if isinstance(other, Tree): | if isinstance(other, Tree): | ||||
| return False | return False | ||||
| return token_res[self.data].match(other) is not None | return token_res[self.data].match(other) is not None | ||||
| class MatchTree(MatchData): | |||||
| def __call__(self, other): | |||||
| class MatchTree(Terminal): | |||||
| def match(self, other): | |||||
| try: | try: | ||||
| return self.data == other.data | return self.data == other.data | ||||
| except AttributeError: | except AttributeError: | ||||
| @@ -90,7 +83,7 @@ class Reconstructor: | |||||
| for name, expansions in d.items(): | for name, expansions in d.items(): | ||||
| for expansion in expansions: | for expansion in expansions: | ||||
| reduced = [sym if sym.startswith('_') or sym in expand1s else | reduced = [sym if sym.startswith('_') or sym in expand1s else | ||||
| (MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym),) | |||||
| MatchTerminal(sym) if is_terminal(sym) else MatchTree(sym) | |||||
| for sym in expansion if not is_discarded_terminal(sym)] | for sym in expansion if not is_discarded_terminal(sym)] | ||||
| rules.append((name, reduced, WriteTokens(name, expansion).f)) | rules.append((name, reduced, WriteTokens(name, expansion).f)) | ||||
| @@ -98,9 +91,9 @@ class Reconstructor: | |||||
| def _reconstruct(self, tree): | def _reconstruct(self, tree): | ||||
| parser = earley.Parser(ParserConf(self.rules, {}, tree.data)) | |||||
| res ,= parser.parse(tree.children) # XXX ambiguity? | |||||
| # TODO: ambiguity? | |||||
| parser = earley.Parser(self.rules, tree.data, {}) | |||||
| res = parser.parse(tree.children) | |||||
| for item in res: | for item in res: | ||||
| if isinstance(item, Tree): | if isinstance(item, Tree): | ||||
| for x in self._reconstruct(item): | for x in self._reconstruct(item): | ||||
| @@ -32,7 +32,11 @@ class Tree(object): | |||||
| self.children[i:i+1] = kid.children | self.children[i:i+1] = kid.children | ||||
| def __eq__(self, other): | def __eq__(self, other): | ||||
| return self.data == other.data and self.children == other.children | |||||
| try: | |||||
| return self.data == other.data and self.children == other.children | |||||
| except AttributeError: | |||||
| return False | |||||
| def __hash__(self): | def __hash__(self): | ||||
| return hash((self.data, tuple(self.children))) | return hash((self.data, tuple(self.children))) | ||||
| @@ -57,10 +61,24 @@ class Tree(object): | |||||
| if pred(c): | if pred(c): | ||||
| yield c | yield c | ||||
| def iter_subtrees(self): | |||||
| q = [self] | |||||
| while q: | |||||
| subtree = q.pop() | |||||
| yield subtree | |||||
| q += [c for c in subtree.children if isinstance(c, Tree)] | |||||
| def __deepcopy__(self, memo): | def __deepcopy__(self, memo): | ||||
| return type(self)(self.data, deepcopy(self.children, memo)) | return type(self)(self.data, deepcopy(self.children, memo)) | ||||
| def copy(self): | |||||
| return type(self)(self.data, self.children) | |||||
| def set(self, data, children): | |||||
| self.data = data | |||||
| self.children = children | |||||
| class Transformer(object): | class Transformer(object): | ||||
| @@ -81,7 +99,7 @@ class Transformer(object): | |||||
| class InlineTransformer(Transformer): | class InlineTransformer(Transformer): | ||||
| def _get_func(self, name): | |||||
| def _get_func(self, name): # use super()._get_func | |||||
| return inline_args(getattr(self, name)).__get__(self) | return inline_args(getattr(self, name)).__get__(self) | ||||
| @@ -97,3 +115,35 @@ class Visitor(object): | |||||
| def __default__(self, tree): | def __default__(self, tree): | ||||
| pass | pass | ||||
class Visitor_NoRecurse(Visitor):
    """Visitor that walks the tree with an explicit list instead of recursion."""

    def visit(self, tree):
        """Call the handler named after each subtree's ``data`` (falling back
        to ``__default__``), in the reverse of ``iter_subtrees()`` order.

        Returns the same ``tree`` object that was passed in.
        """
        pending = list(tree.iter_subtrees())
        pending.reverse()
        for node in pending:
            handler = getattr(self, node.data, self.__default__)
            handler(node)
        return tree
class Transformer_NoRecurse(Transformer):
    """Transformer that processes the tree with an explicit list instead of
    recursion."""

    def transform(self, tree):
        """Transform every subtree in the reverse of ``iter_subtrees()`` order,
        rewriting each node's children in place, then transform the root.
        """
        ordered = list(tree.iter_subtrees())

        def apply(node):
            # Assumes node's children are already transformed
            try:
                func = self._get_func(node.data)
            except AttributeError:
                # No handler for this node type.
                return self.__default__(node)
            return func(node)

        for node in reversed(ordered):
            replaced = []
            for child in node.children:
                replaced.append(apply(child) if isinstance(child, Tree) else child)
            node.children = replaced

        return apply(tree)

    def __default__(self, t):
        "Nodes without a handler are returned unchanged"
        return t
| @@ -69,3 +69,14 @@ def inline_args(f): | |||||
| return f.__func__(self, *args) | return f.__func__(self, *args) | ||||
| return _f | return _f | ||||
try:
    # Python 2 ships a built-in three-way comparison.
    compare = cmp
except NameError:
    def compare(a, b):
        "Return 0 if a == b, 1 if a > b, and -1 otherwise"
        if a == b:
            return 0
        return 1 if a > b else -1
| @@ -73,6 +73,28 @@ class TestEarley(unittest.TestCase): | |||||
| l = Lark(grammar, parser='earley', lexer=None) | l = Lark(grammar, parser='earley', lexer=None) | ||||
| l.parse(program) | l.parse(program) | ||||
| def test_earley_scanless3(self): | |||||
| "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)" | |||||
| grammar = """ | |||||
| start: A A | |||||
| A: "a"+ | |||||
| """ | |||||
| l = Lark(grammar, parser='earley', lexer=None) | |||||
| res = l.parse("aaa") | |||||
| self.assertEqual(res.children, ['aa', 'a']) | |||||
| def test_earley_scanless4(self): | |||||
| grammar = """ | |||||
| start: A A? | |||||
| A: "a"+ | |||||
| """ | |||||
| l = Lark(grammar, parser='earley', lexer=None) | |||||
| res = l.parse("aaa") | |||||
| self.assertEqual(res.children, ['aaa']) | |||||
| def _make_parser_test(LEXER, PARSER): | def _make_parser_test(LEXER, PARSER): | ||||
| def _Lark(grammar, **kwargs): | def _Lark(grammar, **kwargs): | ||||
| return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs) | return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs) | ||||