From c17558dd91a01990408bda747deb15fbd13c0493 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Mon, 20 Feb 2017 20:15:29 +0200
Subject: [PATCH] Officially switched to my Earley implementation

---
 lark/parser_frontends.py |  32 ++---
 lark/parsers/earley.py   | 276 +++++++++++++++++++--------------------
 lark/parsers/earley2.py  | 144 --------------------
 lark/parsers/nearley.py  | 155 ++++++++++++++++++++++
 4 files changed, 301 insertions(+), 306 deletions(-)
 delete mode 100644 lark/parsers/earley2.py
 create mode 100644 lark/parsers/nearley.py

diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index e9f117c..891615a 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,10 +1,10 @@
 import re
 import sre_parse
 
-from .lexer import Lexer, ContextualLexer
+from .lexer import Lexer, ContextualLexer, Token
 
 from .common import is_terminal, GrammarError, ParserConf
-from .parsers import lalr_parser, earley, earley2
+from .parsers import lalr_parser, earley, nearley
 from .parsers.grammar_analysis import Rule
 
 class WithLexer:
@@ -56,18 +56,14 @@ class Nearley(WithLexer):
         WithLexer.__init__(self, lexer_conf)
 
         rules = [{'name':n,
-                  'symbols': list(self._prepare_expansion(x)),
+                  'symbols': self._prepare_expansion(x),
                   'postprocess': getattr(parser_conf.callback, a)}
                  for n,x,a in parser_conf.rules]
 
-        self.parser = earley.Parser(rules, parser_conf.start)
+        self.parser = nearley.Parser(rules, parser_conf.start)
 
     def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                yield sym, None
-            else:
-                yield sym
+        return [(sym, None) if is_terminal(sym) else sym for sym in expansion]
 
     def parse(self, text):
         tokens = list(self.lex(text))
@@ -76,14 +72,14 @@ class Nearley(WithLexer):
         return res[0]
 
 
-class MyEarley(WithLexer):
+class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
 
         rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
 
-        self.parser = earley2.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
 
     def _prepare_expansion(self, expansion):
         return [(sym,) if is_terminal(sym) else sym for sym in expansion]
@@ -95,7 +91,7 @@
 
 
-class Earley_NoLex:
+class Nearley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
@@ -104,7 +100,7 @@
                   'postprocess': getattr(parser_conf.callback, a)}
                  for n,x,a in parser_conf.rules]
 
-        self.parser = earley.Parser(rules, parser_conf.start)
+        self.parser = nearley.Parser(rules, parser_conf.start)
 
     def _prepare_expansion(self, expansion):
         for sym in expansion:
@@ -123,14 +119,14 @@
 
 
-class MyEarley_NoLex:
+class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
         rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
 
-        self.parser = earley2.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
 
     def _prepare_expansion(self, expansion):
         for sym in expansion:
@@ -139,18 +135,18 @@
                 width = sre_parse.parse(regexp).getwidth()
                 if not width == (1,1):
                     raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
-                yield re.compile(regexp).match
+                yield (re.compile(regexp).match,)
             else:
                 yield sym
 
     def parse(self, text):
-        res = self.parser.parse(text)
+        res = self.parser.parse([Token(x,x) for x in text])  # A little hacky perhaps!
         assert len(res) ==1 , 'Ambiguous Parse! Not handled yet'
         return res[0]
 
 
 ENGINE_DICT = {
     'lalr': LALR,
-    'earley': MyEarley,
+    'earley': Earley,
     'earley_nolex': Earley_NoLex,
     'lalr_contextual_lexer': LALR_ContextualLexer
 }
diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index b2a511e..2887a52 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -1,155 +1,143 @@
-"My name is Earley"
+from ..common import ParseError, UnexpectedToken, is_terminal
+from .grammar_analysis import GrammarAnalyzer
 
-from ..utils import classify, STRING_TYPE
-from ..common import ParseError, UnexpectedToken
+# is_terminal = callable
 
-try:
-    xrange
-except NameError:
-    xrange = range
-
-class MatchFailed(object):
-    pass
-
-class AbortParseMatch(Exception):
-    pass
-
-
-class Rule(object):
-    def __init__(self, name, symbols, postprocess):
-        self.name = name
-        self.symbols = symbols
-        self.postprocess = postprocess
-
-class State(object):
-    def __init__(self, rule, expect, reference, data=None):
+class Item:
+    def __init__(self, rule, ptr, start, data):
         self.rule = rule
-        self.expect = expect
-        self.reference = reference
-        self.data = data or []
-
-        self.is_complete = (self.expect == len(self.rule.symbols))
-        if not self.is_complete:
-            self.expect_symbol = self.rule.symbols[self.expect]
-            self.is_terminal = isinstance(self.expect_symbol, tuple)
-        else:
-            self.is_terminal = False
-
-    def next_state(self, data):
-        return State(self.rule, self.expect+1, self.reference, self.data + [data])
-
-    def consume_terminal(self, inp):
-        if not self.is_complete and self.is_terminal:
-            # PORT: originally tests regexp
-
-            if self.expect_symbol[1] is not None:
-                match = self.expect_symbol[1].match(inp)
-                if match:
-                    return self.next_state(inp)
-
-            elif self.expect_symbol[0] == inp.type:
-                return self.next_state(inp)
-
-    def consume_nonterminal(self, inp):
-        if not self.is_complete and not self.is_terminal:
-
-            if self.expect_symbol == inp:
-                return self.next_state(inp)
-
-    def process(self, location, ind, table, rules, added_rules):
-
-        if self.is_complete:
-            # Completed a rule
-            if self.rule.postprocess:
-                try:
-                    self.data = self.rule.postprocess(self.data)
-                except AbortParseMatch:
-                    self.data = MatchFailed
-
-            if self.data is not MatchFailed:
-                for s in table[self.reference]:
-                    x = s.consume_nonterminal(self.rule.name)
-                    if x:
-                        x.data[-1] = self.data
-                        x.epsilon_closure(location, ind, table)
-
-        else:
-            exp = self.rule.symbols[self.expect]
-            if isinstance(exp, tuple):
-                return
-
-            for r in rules[exp]:
-                assert r.name == exp
-                if r not in added_rules:
-                    if r.symbols:
-                        added_rules.add(r)
-                        State(r, 0, location).epsilon_closure(location, ind, table)
-                    else:
-                        # Empty rule
-                        new_copy = self.consume_nonterminal(r.name)
-                        new_copy.data[-1] = r.postprocess([]) if r.postprocess else []
+        self.ptr = ptr
+        self.start = start
+        self.data = data
 
-                        new_copy.epsilon_closure(location, ind, table)
+    @property
+    def expect(self):
+        return self.rule.expansion[self.ptr]
 
-    def epsilon_closure(self, location, ind, table):
-        col = table[location]
-        col.append(self)
+    @property
+    def is_complete(self):
+        return self.ptr == len(self.rule.expansion)
 
-        if not self.is_complete:
-            for i in xrange(ind):
-                state = col[i]
-                if state.is_complete and state.reference == location:
-                    x = self.consume_nonterminal(state.rule.name)
-                    if x:
-                        x.data[-1] = state.data
-                        x.epsilon_closure(location, ind, table)
+    def advance(self, data):
+        return Item(self.rule, self.ptr+1, self.start, self.data + [data])
 
+    def __eq__(self, other):
+        return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule
+    def __hash__(self):
+        return hash((self.rule, self.ptr, self.start))
 
-class Parser(object):
-    def __init__(self, rules, start=None):
-        self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
-        self.rules_by_name = classify(self.rules, lambda r: r.name)
-        self.start = start or self.rules[0].name
 
-    def advance_to(self, table, added_rules):
-        n = len(table)-1
-        for w, s in enumerate(table[n]):
-            s.process(n, w, table, self.rules_by_name, added_rules)
+class Parser:
+    def __init__(self, parser_conf):
+        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.start = parser_conf.start
+
+        self.postprocess = {}
+        self.predictions = {}
+        for rule in self.analysis.rules:
+            if rule.origin != '$root': # XXX kinda ugly
+                self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
+                self.predictions[rule.origin] = [(x.rule, x.index) for x in self.analysis.expand_rule(rule.origin)]
 
     def parse(self, stream):
-        initial_rules = set(self.rules_by_name[self.start])
-        table = [[State(r, 0, 0) for r in initial_rules]]
-        self.advance_to(table, initial_rules)
-
-        i = 0
-
-        while i < len(stream):
-            col = []
-
-            token = stream[i]
-            for s in table[-1]:
-                x = s.consume_terminal(token)
-                if x:
-                    col.append(x)
-
-            if not col:
-                expected = {s.expect_symbol for s in table[-1] if s.is_terminal}
-                raise UnexpectedToken(stream[i], expected, stream, i)
-
-            table.append(col)
-            self.advance_to(table, set())
-
-            i += 1
-
-        res = list(self.finish(table))
-        if not res:
-            raise ParseError('Incomplete parse')
-        return res
-
-    def finish(self, table):
-        for t in table[-1]:
-            if (t.rule.name == self.start
-                and t.expect == len(t.rule.symbols)
-                and t.reference == 0
-                and t.data is not MatchFailed):
-                yield t.data
+        # Define parser functions
+
+        def predict(symbol, i):
+            assert not is_terminal(symbol), symbol
+            return {Item(rule, index, i, []) for rule, index in self.predictions[symbol]}
+
+        def complete(item, table):
+            #item.data = (item.rule_ptr.rule, item.data)
+            item.data = self.postprocess[item.rule](item.data)
+            return {old_item.advance(item.data) for old_item in table[item.start]
+                    if not old_item.is_complete and old_item.expect == item.rule.origin}
+
+        def process_column(i, term):
+            assert i == len(table)-1
+            cur_set = table[i]
+            next_set = set()
+
+            to_process = cur_set
+            while to_process:
+                new_items = set()
+                for item in to_process:
+                    if item.is_complete:
+                        new_items |= complete(item, table)
+                    else:
+                        if is_terminal(item.expect):
+                            # scan
+                            match = item.expect[0](term) if callable(item.expect[0]) else item.expect[0] == term
+                            if match:
+                                next_set.add(item.advance(stream[i]))
+                        else:
+                            if item.ptr: # part of an already predicted batch
+                                new_items |= predict(item.expect, i)
+
+                to_process = new_items - cur_set # TODO: is this precaution necessary?
+                cur_set |= to_process
+
+
+            if not next_set and term != '$end':
+                expect = filter(is_terminal, [x.expect for x in cur_set if not x.is_complete])
+                raise UnexpectedToken(term, expect, stream, i)
+
+            table.append(next_set)
+
+        # Main loop starts
+
+        table = [predict(self.start, 0)]
+
+        for i, char in enumerate(stream):
+            process_column(i, char.type)
+
+        process_column(len(stream), '$end')
+
+        # Parse ended. Now build a parse tree
+        solutions = [n.data for n in table[len(stream)]
+                     if n.is_complete and n.rule.origin==self.start and n.start==0]
+
+        if not solutions:
+            raise ParseError('Incomplete parse: Could not find a solution to input')
+
+        return solutions
+        #return map(self.reduce_solution, solutions)
+
+    def reduce_solution(self, solution):
+        rule, children = solution
+        children = [self.reduce_solution(c) if isinstance(c, tuple) else c for c in children]
+        return self.postprocess[rule](children)
+
+
+
+from ..common import ParserConf
+# A = 'A'.__eq__
+# rules = [
+#     ('a', ['a', A], None),
+#     ('a', ['a', A, 'a'], None),
+#     ('a', ['a', A, A, 'a'], None),
+#     ('a', [A], None),
+# ]
+
+# p = Parser(ParserConf(rules, None, 'a'))
+# for x in p.parse('AAAA'):
+#     print '->'
+#     print x.pretty()
+
+# import re
+# NUM = re.compile('[0-9]').match
+# ADD = re.compile('[+-]').match
+# MUL = re.compile('[*/]').match
+# rules = [
+#     ('sum', ['sum', ADD, 'product'], None),
+#     ('sum', ['product'], None),
+#     ('product', ['product', MUL, 'factor'], None),
+#     ('product', ['factor'], None),
+#     ('factor', ['('.__eq__, 'sum', ')'.__eq__], None),
+#     ('factor', ['number'], None),
+#     ('number', [NUM, 'number'], None),
+#     ('number', [NUM], None),
+# ]
+
+# p = Parser(ParserConf(rules, None, 'sum'))
+# # print p.parse('NALNMNANR')
+# print p.parse('1+(2*3-4)')[0].pretty()
diff --git a/lark/parsers/earley2.py b/lark/parsers/earley2.py
deleted file mode 100644
index 6348747..0000000
--- a/lark/parsers/earley2.py
+++ /dev/null
@@ -1,144 +0,0 @@
-import sys
-
-from ..common import ParseError, UnexpectedToken, is_terminal
-from grammar_analysis import GrammarAnalyzer
-
-# is_terminal = callable
-
-class Item:
-    def __init__(self, rule, ptr, start, data):
-        self.rule = rule
-        self.ptr = ptr
-        self.start = start
-        self.data = data
-
-    @property
-    def expect(self):
-        return self.rule.expansion[self.ptr]
-
-    @property
-    def is_complete(self):
-        return self.ptr == len(self.rule.expansion)
-
-    def advance(self, data):
-        return Item(self.rule, self.ptr+1, self.start, self.data + [data])
-
-    def __eq__(self, other):
-        return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule
-    def __hash__(self):
-        return hash((self.rule, self.ptr, self.start))
-
-
-class Parser:
-    def __init__(self, parser_conf):
-        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
-        self.start = parser_conf.start
-
-        self.postprocess = {}
-        self.predictions = {}
-        for rule in self.analysis.rules:
-            if rule.origin != '$root': # XXX kinda ugly
-                self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
-                self.predictions[rule.origin] = [(x.rule, x.index) for x in self.analysis.expand_rule(rule.origin)]
-
-    def parse(self, stream):
-        # Define parser functions
-
-        def predict(symbol, i):
-            assert not is_terminal(symbol), symbol
-            return {Item(rule, index, i, []) for rule, index in self.predictions[symbol]}
-
-        def complete(item, table):
-            #item.data = (item.rule_ptr.rule, item.data)
-            item.data = self.postprocess[item.rule](item.data)
-            return {old_item.advance(item.data) for old_item in table[item.start]
-                    if not old_item.is_complete and old_item.expect == item.rule.origin}
-
-        def process_column(i, term):
-            assert i == len(table)-1
-            cur_set = table[i]
-            next_set = set()
-
-            to_process = cur_set
-            while to_process:
-                new_items = set()
-                for item in to_process:
-                    if item.is_complete:
-                        new_items |= complete(item, table)
-                    else:
-                        if is_terminal(item.expect):
-                            # scan
-                            if item.expect[0] == term:
-                                next_set.add(item.advance(stream[i]))
-                        else:
-                            if item.ptr: # part of an already predicted batch
-                                new_items |= predict(item.expect, i)
-
-                to_process = new_items - cur_set # TODO: is this precaution necessary?
-                cur_set |= to_process
-
-
-            if not next_set and term != '$end':
-                expect = filter(is_terminal, [x.expect for x in cur_set if not x.is_complete])
-                raise UnexpectedToken(term, expect, stream, i)
-
-            table.append(next_set)
-
-        # Main loop starts
-
-        table = [predict(self.start, 0)]
-
-        for i, char in enumerate(stream):
-            process_column(i, char.type)
-
-        process_column(len(stream), '$end')
-
-        # Parse ended. Now build a parse tree
-        solutions = [n.data for n in table[len(stream)]
-                     if n.is_complete and n.rule.origin==self.start and n.start==0]
-
-        if not solutions:
-            raise ParseError('Incomplete parse: Could not find a solution to input')
-
-        return solutions
-        #return map(self.reduce_solution, solutions)
-
-    def reduce_solution(self, solution):
-        rule, children = solution
-        children = [self.reduce_solution(c) if isinstance(c, tuple) else c for c in children]
-        return self.postprocess[rule](children)
-
-
-
-from ..common import ParserConf
-# A = 'A'.__eq__
-# rules = [
-#     ('a', ['a', A], None),
-#     ('a', ['a', A, 'a'], None),
-#     ('a', ['a', A, A, 'a'], None),
-#     ('a', [A], None),
-# ]
-
-# p = Parser(ParserConf(rules, None, 'a'))
-# for x in p.parse('AAAA'):
-#     print '->'
-#     print x.pretty()
-
-# import re
-# NUM = re.compile('[0-9]').match
-# ADD = re.compile('[+-]').match
-# MUL = re.compile('[*/]').match
-# rules = [
-#     ('sum', ['sum', ADD, 'product'], None),
-#     ('sum', ['product'], None),
-#     ('product', ['product', MUL, 'factor'], None),
-#     ('product', ['factor'], None),
-#     ('factor', ['('.__eq__, 'sum', ')'.__eq__], None),
-#     ('factor', ['number'], None),
-#     ('number', [NUM, 'number'], None),
-#     ('number', [NUM], None),
-# ]
-
-# p = Parser(ParserConf(rules, None, 'sum'))
-# # print p.parse('NALNMNANR')
-# print p.parse('1+(2*3-4)')[0].pretty()
diff --git a/lark/parsers/nearley.py b/lark/parsers/nearley.py
new file mode 100644
index 0000000..b2a511e
--- /dev/null
+++ b/lark/parsers/nearley.py
@@ -0,0 +1,155 @@
+"My name is Earley"
+
+from ..utils import classify, STRING_TYPE
+from ..common import ParseError, UnexpectedToken
+
+try:
+    xrange
+except NameError:
+    xrange = range
+
+class MatchFailed(object):
+    pass
+
+class AbortParseMatch(Exception):
+    pass
+
+
+class Rule(object):
+    def __init__(self, name, symbols, postprocess):
+        self.name = name
+        self.symbols = symbols
+        self.postprocess = postprocess
+
+class State(object):
+    def __init__(self, rule, expect, reference, data=None):
+        self.rule = rule
+        self.expect = expect
+        self.reference = reference
+        self.data = data or []
+
+        self.is_complete = (self.expect == len(self.rule.symbols))
+        if not self.is_complete:
+            self.expect_symbol = self.rule.symbols[self.expect]
+            self.is_terminal = isinstance(self.expect_symbol, tuple)
+        else:
+            self.is_terminal = False
+
+    def next_state(self, data):
+        return State(self.rule, self.expect+1, self.reference, self.data + [data])
+
+    def consume_terminal(self, inp):
+        if not self.is_complete and self.is_terminal:
+            # PORT: originally tests regexp
+
+            if self.expect_symbol[1] is not None:
+                match = self.expect_symbol[1].match(inp)
+                if match:
+                    return self.next_state(inp)
+
+            elif self.expect_symbol[0] == inp.type:
+                return self.next_state(inp)
+
+    def consume_nonterminal(self, inp):
+        if not self.is_complete and not self.is_terminal:
+
+            if self.expect_symbol == inp:
+                return self.next_state(inp)
+
+    def process(self, location, ind, table, rules, added_rules):
+
+        if self.is_complete:
+            # Completed a rule
+            if self.rule.postprocess:
+                try:
+                    self.data = self.rule.postprocess(self.data)
+                except AbortParseMatch:
+                    self.data = MatchFailed
+
+            if self.data is not MatchFailed:
+                for s in table[self.reference]:
+                    x = s.consume_nonterminal(self.rule.name)
+                    if x:
+                        x.data[-1] = self.data
+                        x.epsilon_closure(location, ind, table)
+
+        else:
+            exp = self.rule.symbols[self.expect]
+            if isinstance(exp, tuple):
+                return
+
+            for r in rules[exp]:
+                assert r.name == exp
+                if r not in added_rules:
+                    if r.symbols:
+                        added_rules.add(r)
+                        State(r, 0, location).epsilon_closure(location, ind, table)
+                    else:
+                        # Empty rule
+                        new_copy = self.consume_nonterminal(r.name)
+                        new_copy.data[-1] = r.postprocess([]) if r.postprocess else []
+
+                        new_copy.epsilon_closure(location, ind, table)
+
+    def epsilon_closure(self, location, ind, table):
+        col = table[location]
+        col.append(self)
+
+        if not self.is_complete:
+            for i in xrange(ind):
+                state = col[i]
+                if state.is_complete and state.reference == location:
+                    x = self.consume_nonterminal(state.rule.name)
+                    if x:
+                        x.data[-1] = state.data
+                        x.epsilon_closure(location, ind, table)
+
+
+class Parser(object):
+    def __init__(self, rules, start=None):
+        self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
+        self.rules_by_name = classify(self.rules, lambda r: r.name)
+        self.start = start or self.rules[0].name
+
+    def advance_to(self, table, added_rules):
+        n = len(table)-1
+        for w, s in enumerate(table[n]):
+            s.process(n, w, table, self.rules_by_name, added_rules)
+
+    def parse(self, stream):
+        initial_rules = set(self.rules_by_name[self.start])
+        table = [[State(r, 0, 0) for r in initial_rules]]
+        self.advance_to(table, initial_rules)
+
+        i = 0
+
+        while i < len(stream):
+            col = []
+
+            token = stream[i]
+            for s in table[-1]:
+                x = s.consume_terminal(token)
+                if x:
+                    col.append(x)
+
+            if not col:
+                expected = {s.expect_symbol for s in table[-1] if s.is_terminal}
+                raise UnexpectedToken(stream[i], expected, stream, i)
+
+            table.append(col)
+            self.advance_to(table, set())
+
+            i += 1
+
+        res = list(self.finish(table))
+        if not res:
+            raise ParseError('Incomplete parse')
+        return res
+
+    def finish(self, table):
+        for t in table[-1]:
+            if (t.rule.name == self.start
+                and t.expect == len(t.rule.symbols)
+                and t.reference == 0
+                and t.data is not MatchFailed):
+                yield t.data
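
Note for reviewers: below is a minimal, self-contained sketch of the chart-based predict/scan/complete loop that the promoted earley.py implements. The toy grammar and the Rule/Item tuples here are illustrative stand-ins, not the module's actual classes (the real Parser threads per-rule callbacks and a GrammarAnalyzer through ParserConf), and the sketch only recognizes input rather than building a tree.

    # A hedged sketch of an Earley recognizer; all names here are hypothetical.
    from collections import namedtuple

    Rule = namedtuple('Rule', ['origin', 'expansion'])   # nonterminal -> sequence of symbols
    Item = namedtuple('Item', ['rule', 'ptr', 'start'])  # dotted rule + origin column

    GRAMMAR = [                      # toy grammar, assumed for illustration only
        Rule('sum', ('sum', '+', 'num')),
        Rule('sum', ('num',)),
        Rule('num', ('1',)),
        Rule('num', ('2',)),
    ]

    def recognize(grammar, start, stream):
        def is_terminal(sym):
            return all(r.origin != sym for r in grammar)

        # table[i] holds the Earley items alive after reading i characters
        table = [set() for _ in range(len(stream) + 1)]
        table[0] = {Item(r, 0, 0) for r in grammar if r.origin == start}
        for i in range(len(stream) + 1):
            changed = True
            while changed:                       # run predict/complete to a fixed point
                changed = False
                for item in list(table[i]):
                    if item.ptr == len(item.rule.expansion):
                        # complete: advance items that were waiting on this rule's origin
                        for old in list(table[item.start]):
                            if (old.ptr < len(old.rule.expansion)
                                    and old.rule.expansion[old.ptr] == item.rule.origin):
                                nxt = Item(old.rule, old.ptr + 1, old.start)
                                if nxt not in table[i]:
                                    table[i].add(nxt)
                                    changed = True
                    elif is_terminal(item.rule.expansion[item.ptr]):
                        # scan: a matching terminal moves the item to the next column
                        if i < len(stream) and stream[i] == item.rule.expansion[item.ptr]:
                            table[i + 1].add(Item(item.rule, item.ptr + 1, item.start))
                    else:
                        # predict: expand the expected nonterminal at this column
                        for r in grammar:
                            if r.origin == item.rule.expansion[item.ptr]:
                                nxt = Item(r, 0, i)
                                if nxt not in table[i]:
                                    table[i].add(nxt)
                                    changed = True
        return any(it.rule.origin == start and it.ptr == len(it.rule.expansion) and it.start == 0
                   for it in table[len(stream)])

    print(recognize(GRAMMAR, 'sum', '1+2'))  # True
    print(recognize(GRAMMAR, 'sum', '1+'))   # False

Unlike this sketch, the parser in the patch also builds results while it completes items: complete() runs the matched rule's callback (self.postprocess[item.rule]) on the collected children, which is why an unambiguous parse ends with exactly one solution in the final column.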
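
On the terminal encoding: the frontends mark a terminal by wrapping it in a tuple, which is what the scan step in earley.py keys on. With a standard lexer, Earley._prepare_expansion yields ('TOKEN_NAME',) and each column is fed char.type; in Earley_NoLex it yields (re.compile(...).match,) and parse() feeds one-character Tokens. A small illustrative sketch of that dispatch (the helper name is hypothetical):

    import re

    def scan_matches(expect, term):
        # Mirrors the new scan line in earley.py:
        #   match = item.expect[0](term) if callable(item.expect[0]) else item.expect[0] == term
        matcher = expect[0]
        return bool(matcher(term)) if callable(matcher) else matcher == term

    print(scan_matches(('NUMBER',), 'NUMBER'))              # token-type comparison -> True
    print(scan_matches((re.compile('[0-9]').match,), '7'))  # regex on a 1-char token -> True

This is also why the width check in Earley_NoLex insists every token's regexp has a width of exactly (1,1): the dynamic-lexing mode scans the input one character at a time.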