diff --git a/lark/common.py b/lark/common.py
index 06220f0..122c7e5 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -28,7 +28,7 @@ class UnexpectedToken(ParseError):
 
 
 def is_terminal(sym):
-    return sym.isupper() or sym[0] == '$'
+    return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$'
 
 
 class LexerConf:
diff --git a/lark/lexer.py b/lark/lexer.py
index 301d555..db5dde7 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -197,14 +197,19 @@ class ContextualLexer:
 
         self.root_lexer = Lexer(tokens, ignore=ignore)
 
-    def lex(self, stream, parser):
+        self.set_parser_state(None) # Needs to be set on the outside
+
+    def set_parser_state(self, state):
+        self.parser_state = state
+
+    def lex(self, stream):
         lex_pos = 0
         line = 1
         col_start_pos = 0
         newline_types = list(self.root_lexer.newline_types)
         ignore_types = list(self.root_lexer.ignore_types)
         while True:
-            lexer = self.lexers[parser.state]
+            lexer = self.lexers[self.parser_state]
             for mre, type_from_index in lexer.mres:
                 m = mre.match(stream, lex_pos)
                 if m:
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 54b67bb..891615a 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,11 +1,11 @@
 import re
 import sre_parse
 
-from .lexer import Lexer, ContextualLexer
-from .parsers.lalr_analysis import GrammarAnalyzer
+from .lexer import Lexer, ContextualLexer, Token
 
-from .common import is_terminal, GrammarError
-from .parsers import lalr_parser, earley
+from .common import is_terminal, GrammarError, ParserConf
+from .parsers import lalr_parser, earley, nearley
+from .parsers.grammar_analysis import Rule
 
 class WithLexer:
     def __init__(self, lexer_conf):
@@ -22,11 +22,9 @@ class WithLexer:
 class LALR(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
-        self.parser_conf = parser_conf
 
-        analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
-        analyzer.analyze()
-        self.parser = lalr_parser.Parser(analyzer, parser_conf.callback)
+        self.parser_conf = parser_conf
+        self.parser = lalr_parser.Parser(parser_conf)
 
     def parse(self, text):
         tokens = list(self.lex(text))
@@ -37,41 +35,35 @@ class LALR_ContextualLexer:
     def __init__(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf
         self.parser_conf = parser_conf
 
-        self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
-        self.analyzer.analyze()
+        self.parser = lalr_parser.Parser(parser_conf)
 
-        d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
+        d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()}
         self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
                                      always_accept=lexer_conf.postlex.always_accept
                                      if lexer_conf.postlex else ())
 
     def parse(self, text):
-        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
-        tokens = self.lexer.lex(text, parser)
+        tokens = self.lexer.lex(text)
         if self.lexer_conf.postlex:
             tokens = self.lexer_conf.postlex.process(tokens)
-        return parser.parse(tokens, True)
+        return self.parser.parse(tokens, self.lexer.set_parser_state)
 
-class Earley(WithLexer):
+class Nearley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
 
         rules = [{'name':n,
-                  'symbols': list(self._prepare_expansion(x)),
+                  'symbols': self._prepare_expansion(x),
                   'postprocess': getattr(parser_conf.callback, a)}
                  for n,x,a in parser_conf.rules]
 
-        self.parser = earley.Parser(rules, parser_conf.start)
+        self.parser = nearley.Parser(rules, parser_conf.start)
 
     def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                yield sym, None
-            else:
-                yield sym
+        return [(sym, None) if is_terminal(sym) else sym for sym in expansion]
 
     def parse(self, text):
         tokens = list(self.lex(text))
@@ -79,7 +71,27 @@ class Earley(WithLexer):
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-class Earley_NoLex:
+
+class Earley(WithLexer):
+    def __init__(self, lexer_conf, parser_conf):
+        WithLexer.__init__(self, lexer_conf)
+
+        rules = [(n, self._prepare_expansion(x), a)
+                 for n,x,a in parser_conf.rules]
+
+        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+
+    def _prepare_expansion(self, expansion):
+        return [(sym,) if is_terminal(sym) else sym for sym in expansion]
+
+    def parse(self, text):
+        tokens = list(self.lex(text))
+        res = self.parser.parse(tokens)
+        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
+        return res[0]
+
+
+class Nearley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
@@ -88,7 +100,7 @@ class Earley_NoLex:
                   'postprocess': getattr(parser_conf.callback, a)}
                  for n,x,a in parser_conf.rules]
 
-        self.parser = earley.Parser(rules, parser_conf.start)
+        self.parser = nearley.Parser(rules, parser_conf.start)
 
     def _prepare_expansion(self, expansion):
         for sym in expansion:
@@ -106,4 +118,35 @@ class Earley_NoLex:
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
+
+class Earley_NoLex:
+    def __init__(self, lexer_conf, parser_conf):
+        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
+
+        rules = [(n, list(self._prepare_expansion(x)), a)
+                 for n,x,a in parser_conf.rules]
+
+        self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+
+    def _prepare_expansion(self, expansion):
+        for sym in expansion:
+            if is_terminal(sym):
+                regexp = self.token_by_name[sym].to_regexp()
+                width = sre_parse.parse(regexp).getwidth()
+                if not width == (1,1):
+                    raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
+                yield (re.compile(regexp).match,)
+            else:
+                yield sym
+
+    def parse(self, text):
+        res = self.parser.parse([Token(x,x) for x in text])   # A little hacky perhaps!
+        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
+        return res[0]
+
+ENGINE_DICT = {
+    'lalr': LALR,
+    'earley': Earley,
+    'earley_nolex': Earley_NoLex,
+    'lalr_contextual_lexer': LALR_ContextualLexer
+}
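The lexer/parser handshake above inverts the old dependency: instead of the lexer reaching into `parser.state`, the parser now pushes every state change into the lexer through `set_parser_state`, which `LALR_ContextualLexer.parse` hands over as the `set_state` callback. A minimal self-contained sketch of that contract (all names below are illustrative, not lark's):

```python
# Toy model of the callback contract: the parser drives, the lexer follows.

class ToyLexer:
    def __init__(self):
        self.parser_state = None            # set from the outside, as in the patch

    def set_parser_state(self, state):
        self.parser_state = state

    def lex(self, text):
        for ch in text:
            # A real contextual lexer would pick a per-state sub-lexer here;
            # we just record which state each character was lexed under.
            yield (self.parser_state, ch)

class ToyParser:
    TRANSITIONS = {0: 1, 1: 0}              # hypothetical two-state automaton

    def parse(self, tokens, set_state=None):
        state = 0
        if set_state: set_state(state)      # mirrors lalr_parser.Parser.parse
        result = []
        for tok in tokens:
            result.append(tok)
            state = self.TRANSITIONS[state]
            if set_state: set_state(state)  # mirrors the ACTION_SHIFT branch
        return result

lexer = ToyLexer()
print(ToyParser().parse(lexer.lex("ab"), lexer.set_parser_state))
# [(0, 'a'), (1, 'b')]: each token was matched under the current parser state
```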
diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index b2a511e..0ba74a2 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -1,155 +1,98 @@
-"My name is Earley"
+from ..common import ParseError, UnexpectedToken, is_terminal
+from .grammar_analysis import GrammarAnalyzer
 
-from ..utils import classify, STRING_TYPE
-from ..common import ParseError, UnexpectedToken
+class Item:
+    def __init__(self, rule, ptr, start, data):
+        self.rule = rule
+        self.ptr = ptr
+        self.start = start
+        self.data = data
 
-try:
-    xrange
-except NameError:
-    xrange = range
+    @property
+    def expect(self):
+        return self.rule.expansion[self.ptr]
 
-class MatchFailed(object):
-    pass
+    @property
+    def is_complete(self):
+        return self.ptr == len(self.rule.expansion)
 
-class AbortParseMatch(Exception):
-    pass
+    def advance(self, data):
+        return Item(self.rule, self.ptr+1, self.start, self.data + [data])
 
+    def __eq__(self, other):
+        return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule
+    def __hash__(self):
+        return hash((self.rule, self.ptr, self.start))
 
-class Rule(object):
-    def __init__(self, name, symbols, postprocess):
-        self.name = name
-        self.symbols = symbols
-        self.postprocess = postprocess
 
-class State(object):
-    def __init__(self, rule, expect, reference, data=None):
-        self.rule = rule
-        self.expect = expect
-        self.reference = reference
-        self.data = data or []
-
-        self.is_complete = (self.expect == len(self.rule.symbols))
-        if not self.is_complete:
-            self.expect_symbol = self.rule.symbols[self.expect]
-            self.is_terminal = isinstance(self.expect_symbol, tuple)
-        else:
-            self.is_terminal = False
-
-    def next_state(self, data):
-        return State(self.rule, self.expect+1, self.reference, self.data + [data])
-
-    def consume_terminal(self, inp):
-        if not self.is_complete and self.is_terminal:
-            # PORT: originally tests regexp
-
-            if self.expect_symbol[1] is not None:
-                match = self.expect_symbol[1].match(inp)
-                if match:
-                    return self.next_state(inp)
-
-            elif self.expect_symbol[0] == inp.type:
-                return self.next_state(inp)
-
-    def consume_nonterminal(self, inp):
-        if not self.is_complete and not self.is_terminal:
-
-            if self.expect_symbol == inp:
-                return self.next_state(inp)
-
-    def process(self, location, ind, table, rules, added_rules):
-
-        if self.is_complete:
-            # Completed a rule
-            if self.rule.postprocess:
-                try:
-                    self.data = self.rule.postprocess(self.data)
-                except AbortParseMatch:
-                    self.data = MatchFailed
-
-            if self.data is not MatchFailed:
-                for s in table[self.reference]:
-                    x = s.consume_nonterminal(self.rule.name)
-                    if x:
-                        x.data[-1] = self.data
-                        x.epsilon_closure(location, ind, table)
-
-        else:
-            exp = self.rule.symbols[self.expect]
-            if isinstance(exp, tuple):
-                return
-
-            for r in rules[exp]:
-                assert r.name == exp
-                if r not in added_rules:
-                    if r.symbols:
-                        added_rules.add(r)
-                        State(r, 0, location).epsilon_closure(location, ind, table)
+class Parser:
+    def __init__(self, parser_conf):
+        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.start = parser_conf.start
+
+        self.postprocess = {}
+        self.predictions = {}
+        for rule in self.analysis.rules:
+            if rule.origin != '$root':  # XXX kinda ugly
+                self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
+                self.predictions[rule.origin] = [(x.rule, x.index) for x in self.analysis.expand_rule(rule.origin)]
+
+    def parse(self, stream):
+        # Define parser functions
+
+        def predict(symbol, i):
+            assert not is_terminal(symbol), symbol
+            return {Item(rule, index, i, []) for rule, index in self.predictions[symbol]}
+
+        def complete(item, table):
+            item.data = self.postprocess[item.rule](item.data)
+            return {old_item.advance(item.data) for old_item in table[item.start]
+                    if not old_item.is_complete and old_item.expect == item.rule.origin}
+
+        def process_column(i, term):
+            assert i == len(table)-1
+            cur_set = table[i]
+            next_set = set()
+
+            to_process = cur_set
+            while to_process:
+                new_items = set()
+                for item in to_process:
+                    if item.is_complete:
+                        new_items |= complete(item, table)
                     else:
-                        # Empty rule
-                        new_copy = self.consume_nonterminal(r.name)
-                        new_copy.data[-1] = r.postprocess([]) if r.postprocess else []
+                        if is_terminal(item.expect):
+                            # scan
+                            match = item.expect[0](term) if callable(item.expect[0]) else item.expect[0] == term
+                            if match:
+                                next_set.add(item.advance(stream[i]))
+                        else:
+                            if item.ptr: # part of an already predicted batch
+                                new_items |= predict(item.expect, i)
 
-                        new_copy.epsilon_closure(location, ind, table)
+                to_process = new_items - cur_set    # TODO: is this precaution necessary?
+                cur_set |= to_process
 
-    def epsilon_closure(self, location, ind, table):
-        col = table[location]
-        col.append(self)
 
-        if not self.is_complete:
-            for i in xrange(ind):
-                state = col[i]
-                if state.is_complete and state.reference == location:
-                    x = self.consume_nonterminal(state.rule.name)
-                    if x:
-                        x.data[-1] = state.data
-                        x.epsilon_closure(location, ind, table)
+            if not next_set and term != '$end':
+                expect = filter(is_terminal, [x.expect for x in cur_set if not x.is_complete])
+                raise UnexpectedToken(term, expect, stream, i)
 
+            table.append(next_set)
 
-class Parser(object):
-    def __init__(self, rules, start=None):
-        self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
-        self.rules_by_name = classify(self.rules, lambda r: r.name)
-        self.start = start or self.rules[0].name
+        # Main loop starts
+        table = [predict(self.start, 0)]
 
-    def advance_to(self, table, added_rules):
-        n = len(table)-1
-        for w, s in enumerate(table[n]):
-            s.process(n, w, table, self.rules_by_name, added_rules)
+        for i, char in enumerate(stream):
+            process_column(i, char.type)
 
-    def parse(self, stream):
-        initial_rules = set(self.rules_by_name[self.start])
-        table = [[State(r, 0, 0) for r in initial_rules]]
-        self.advance_to(table, initial_rules)
-
-        i = 0
-
-        while i < len(stream):
-            col = []
-
-            token = stream[i]
-            for s in table[-1]:
-                x = s.consume_terminal(token)
-                if x:
-                    col.append(x)
-
-            if not col:
-                expected = {s.expect_symbol for s in table[-1] if s.is_terminal}
-                raise UnexpectedToken(stream[i], expected, stream, i)
-
-            table.append(col)
-            self.advance_to(table, set())
-
-            i += 1
-
-        res = list(self.finish(table))
-        if not res:
-            raise ParseError('Incomplete parse')
-        return res
-
-    def finish(self, table):
-        for t in table[-1]:
-            if (t.rule.name == self.start
-                and t.expect == len(t.rule.symbols)
-                and t.reference == 0
-                and t.data is not MatchFailed):
-                yield t.data
+        process_column(len(stream), '$end')
+
+        # Parse ended. Now build a parse tree
+        solutions = [n.data for n in table[len(stream)]
+                     if n.is_complete and n.rule.origin==self.start and n.start==0]
+
+        if not solutions:
+            raise ParseError('Incomplete parse: Could not find a solution to input')
+
+        return solutions
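One detail worth noting in the new `earley.py`: `Item.__eq__`/`__hash__` deliberately exclude the accumulated `.data`, so a column set collapses two derivations of the same dotted rule into a single chart entry. A standalone illustration of that identity choice (mirroring, not importing, the `Item` above):

```python
class Rule:
    def __init__(self, origin, expansion):
        self.origin, self.expansion = origin, expansion

class Item:
    def __init__(self, rule, ptr, start, data):
        self.rule, self.ptr, self.start, self.data = rule, ptr, start, data
    def __eq__(self, other):
        # identity is (rule, dot position, origin column); .data is ignored
        return (self.rule, self.ptr, self.start) == (other.rule, other.ptr, other.start)
    def __hash__(self):
        return hash((self.rule, self.ptr, self.start))

r = Rule('sum', ['sum', 'PLUS', 'product'])
column = {Item(r, 1, 0, ['left variant']), Item(r, 1, 0, ['right variant'])}
print(len(column))  # 1: same dot, same origin => one chart entry
```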
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
new file mode 100644
index 0000000..c03d9ae
--- /dev/null
+++ b/lark/parsers/grammar_analysis.py
@@ -0,0 +1,156 @@
+
+from ..utils import bfs, fzset
+from ..common import GrammarError, is_terminal
+
+class Rule(object):
+    """
+        origin : a symbol
+        expansion : a list of symbols
+    """
+    def __init__(self, origin, expansion, alias=None):
+        self.origin = origin
+        self.expansion = expansion
+        self.alias = alias
+
+    def __repr__(self):
+        return '<%s : %s>' % (self.origin, ' '.join(self.expansion))
+
+class RulePtr(object):
+    def __init__(self, rule, index):
+        assert isinstance(rule, Rule)
+        assert index <= len(rule.expansion)
+        self.rule = rule
+        self.index = index
+
+    def __repr__(self):
+        before = self.rule.expansion[:self.index]
+        after = self.rule.expansion[self.index:]
+        return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))
+
+    @property
+    def next(self):
+        return self.rule.expansion[self.index]
+
+    def advance(self, sym):
+        assert self.next == sym
+        return RulePtr(self.rule, self.index+1)
+
+    @property
+    def is_satisfied(self):
+        return self.index == len(self.rule.expansion)
+
+    def __eq__(self, other):
+        return self.rule == other.rule and self.index == other.index
+    def __hash__(self):
+        return hash((self.rule, self.index))
+
+
+def pairs(lst):
+    return zip(lst[:-1], lst[1:])
+
+def update_set(set1, set2):
+    copy = set(set1)
+    set1 |= set2
+    return set1 != copy
+
+def calculate_sets(rules):
+    """Calculate FOLLOW sets.
+
+    Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
+    symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
+    symbols.add('$root')    # what about other unused rules?
+
+    # foreach grammar rule X ::= Y(1) ... Y(k)
+    # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
+    #   NULLABLE = NULLABLE union {X}
+    # for i = 1 to k
+    #   if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
+    #     FIRST(X) = FIRST(X) union FIRST(Y(i))
+    #   for j = i+1 to k
+    #     if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
+    #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
+    #     if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
+    #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
+    # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
+
+    NULLABLE = set()
+    FIRST = {}
+    FOLLOW = {}
+    for sym in symbols:
+        FIRST[sym]={sym} if is_terminal(sym) else set()
+        FOLLOW[sym]=set()
+
+    changed = True
+    while changed:
+        changed = False
+
+        for rule in rules:
+            if set(rule.expansion) <= NULLABLE:
+                if update_set(NULLABLE, {rule.origin}):
+                    changed = True
+
+            for i, sym in enumerate(rule.expansion):
+                if set(rule.expansion[:i]) <= NULLABLE:
+                    if update_set(FIRST[rule.origin], FIRST[sym]):
+                        changed = True
+                if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
+                    if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
+                        changed = True
+
+                for j in range(i+1, len(rule.expansion)):
+                    if set(rule.expansion[i+1:j]) <= NULLABLE:
+                        if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
+                            changed = True
+
+    return FIRST, FOLLOW, NULLABLE
+
+
+class GrammarAnalyzer(object):
+    def __init__(self, rule_tuples, start_symbol, debug=False):
+        self.start_symbol = start_symbol
+        self.debug = debug
+        rule_tuples = list(rule_tuples)
+        rule_tuples.append(('$root', [start_symbol, '$end']))
+        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]
+
+        self.rules = set()
+        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
+        for origin, exp, alias in rule_tuples:
+            r = Rule( origin, exp, alias )
+            self.rules.add(r)
+            self.rules_by_origin[origin].append(r)
+
+        for r in self.rules:
+            for sym in r.expansion:
+                if not (is_terminal(sym) or sym in self.rules_by_origin):
+                    raise GrammarError("Using an undefined rule: %s" % sym)
+
+        self.init_state = self.expand_rule(start_symbol)
+
+        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)
+
+    def expand_rule(self, rule):
+        "Returns all init_ptrs accessible by rule (recursive)"
+        init_ptrs = set()
+        def _expand_rule(rule):
+            assert not is_terminal(rule), rule
+
+            for r in self.rules_by_origin[rule]:
+                init_ptr = RulePtr(r, 0)
+                init_ptrs.add(init_ptr)
+
+                if r.expansion: # if not empty rule
+                    new_r = init_ptr.next
+                    if not is_terminal(new_r):
+                        yield new_r
+
+        _ = list(bfs([rule], _expand_rule))
+
+        return fzset(init_ptrs)
+
+    def _first(self, r):
+        if is_terminal(r):
+            return {r}
+        else:
+            return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}
+
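The fixed-point loop in `calculate_sets` can be exercised directly. A hypothetical usage sketch on a toy grammar, assuming lark's convention that uppercase symbols are terminals (expected outputs in the comments):

```python
from lark.parsers.grammar_analysis import Rule, calculate_sets

# expr -> NUMBER | expr PLUS expr
rules = [
    Rule('expr', ['NUMBER']),
    Rule('expr', ['expr', 'PLUS', 'expr']),
]
FIRST, FOLLOW, NULLABLE = calculate_sets(rules)

print(FIRST['expr'])   # {'NUMBER'}: every expr ultimately starts with NUMBER
print(FOLLOW['expr'])  # {'PLUS'}: an expr may be followed by PLUS
print(NULLABLE)        # set(): nothing here derives the empty string
```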
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index e50de18..83f96fc 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -1,162 +1,16 @@
 import logging
-from collections import defaultdict, deque
+from collections import defaultdict
 
 from ..utils import classify, classify_bool, bfs, fzset
 from ..common import GrammarError, is_terminal
 
+from .grammar_analysis import GrammarAnalyzer
+
 ACTION_SHIFT = 0
 
-class Rule(object):
-    """
-        origin : a symbol
-        expansion : a list of symbols
-    """
-    def __init__(self, origin, expansion, alias=None):
-        self.origin = origin
-        self.expansion = expansion
-        self.alias = alias
-
-    def __repr__(self):
-        return '<%s : %s>' % (self.origin, ' '.join(self.expansion))
-
-class RulePtr(object):
-    def __init__(self, rule, index):
-        assert isinstance(rule, Rule)
-        assert index <= len(rule.expansion)
-        self.rule = rule
-        self.index = index
-
-    def __repr__(self):
-        before = self.rule.expansion[:self.index]
-        after = self.rule.expansion[self.index:]
-        return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))
-
-    @property
-    def next(self):
-        return self.rule.expansion[self.index]
-
-    def advance(self, sym):
-        assert self.next == sym
-        return RulePtr(self.rule, self.index+1)
-
-    @property
-    def is_satisfied(self):
-        return self.index == len(self.rule.expansion)
-
-    def __eq__(self, other):
-        return self.rule == other.rule and self.index == other.index
-    def __hash__(self):
-        return hash((self.rule, self.index))
-
-
-def pairs(lst):
-    return zip(lst[:-1], lst[1:])
-
-def update_set(set1, set2):
-    copy = set(set1)
-    set1 |= set2
-    return set1 != copy
-
-class GrammarAnalyzer(object):
-    def __init__(self, rule_tuples, start_symbol, debug=False):
-        self.start_symbol = start_symbol
-        self.debug = debug
-        rule_tuples = list(rule_tuples)
-        rule_tuples.append(('$root', [start_symbol, '$end']))
-        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]
-
-        self.rules = set()
-        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
-        for origin, exp, alias in rule_tuples:
-            r = Rule( origin, exp, alias )
-            self.rules.add(r)
-            self.rules_by_origin[origin].append(r)
-
-        for r in self.rules:
-            for sym in r.expansion:
-                if not (is_terminal(sym) or sym in self.rules_by_origin):
-                    raise GrammarError("Using an undefined rule: %s" % sym)
-
-        self.init_state = self.expand_rule(start_symbol)
-
-    def expand_rule(self, rule):
-        "Returns all init_ptrs accessible by rule (recursive)"
-        init_ptrs = set()
-        def _expand_rule(rule):
-            assert not is_terminal(rule)
-
-            for r in self.rules_by_origin[rule]:
-                init_ptr = RulePtr(r, 0)
-                init_ptrs.add(init_ptr)
-
-                if r.expansion: # if not empty rule
-                    new_r = init_ptr.next
-                    if not is_terminal(new_r):
-                        yield new_r
-
-        _ = list(bfs([rule], _expand_rule))
-
-        return fzset(init_ptrs)
-
-    def _first(self, r):
-        if is_terminal(r):
-            return {r}
-        else:
-            return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}
-
-    def _calc(self):
-        """Calculate FOLLOW sets.
-
-        Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
-        symbols = {sym for rule in self.rules for sym in rule.expansion} | {rule.origin for rule in self.rules}
-        symbols.add('$root')    # what about other unused rules?
-
-        # foreach grammar rule X ::= Y(1) ... Y(k)
-        # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
-        #   NULLABLE = NULLABLE union {X}
-        # for i = 1 to k
-        #   if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
-        #     FIRST(X) = FIRST(X) union FIRST(Y(i))
-        #   for j = i+1 to k
-        #     if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
-        #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
-        #     if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
-        #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
-        # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
-
-        NULLABLE = set()
-        FIRST = {}
-        FOLLOW = {}
-        for sym in symbols:
-            FIRST[sym]={sym} if is_terminal(sym) else set()
-            FOLLOW[sym]=set()
-
-        changed = True
-        while changed:
-            changed = False
-
-            for rule in self.rules:
-                if set(rule.expansion) <= NULLABLE:
-                    if update_set(NULLABLE, {rule.origin}):
-                        changed = True
-
-                for i, sym in enumerate(rule.expansion):
-                    if set(rule.expansion[:i]) <= NULLABLE:
-                        if update_set(FIRST[rule.origin], FIRST[sym]):
-                            changed = True
-                    if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
-                        if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
-                            changed = True
-
-                    for j in range(i+1, len(rule.expansion)):
-                        if set(rule.expansion[i+1:j]) <= NULLABLE:
-                            if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
-                                changed = True
-
-        self.FOLLOW = FOLLOW
-
-    def analyze(self):
-        self._calc()
+class LALR_Analyzer(GrammarAnalyzer):
+
+    def compute_lookahead(self):
         self.states = {}
         def step(state):
@@ -188,7 +42,8 @@ class GrammarAnalyzer(object):
                     lookahead[k] = [x]
 
         for k, v in lookahead.items():
-            assert len(v) == 1, ("Collision", k, v)
+            if not len(v) == 1:
+                raise GrammarError("Collision in %s: %s" %(k, v))
 
         self.states[state] = {k:v[0] for k, v in lookahead.items()}
 
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 313d808..7394f91 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -1,15 +1,15 @@
-from .lalr_analysis import ACTION_SHIFT
 from ..common import ParseError, UnexpectedToken
+from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT
 
 class Parser(object):
-    def __init__(self, analysis, callback):
-        self.analysis = analysis
-        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
-                          for rule in analysis.rules}
-        self.state = self.analysis.init_state_idx
+    def __init__(self, parser_conf):
+        self.analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start)
+        self.analysis.compute_lookahead()
+        self.callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
+                          for rule in self.analysis.rules}
 
-    def parse(self, seq, set_state=False):
+    def parse(self, seq, set_state=None):
         i = 0
         stream = iter(seq)
         states_idx = self.analysis.states_idx
@@ -17,6 +17,8 @@ class Parser(object):
         state_stack = [self.analysis.init_state_idx]
         value_stack = []
 
+        if set_state: set_state(self.analysis.init_state_idx)
+
        def get_action(key):
            state = state_stack[-1]
            try:
@@ -54,7 +56,7 @@ class Parser(object):
             if action == ACTION_SHIFT:
                 state_stack.append(arg)
                 value_stack.append(token)
-                if set_state: self.state = arg
+                if set_state: set_state(arg)
                 token = next(stream)
                 i += 1
             else:
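With this change the LALR parser owns its analyzer, so a frontend only hands over the configuration object. A hedged sketch of the new construction path; `ParserConf(rules, callback, start)` is inferred from its use in this patch, and the grammar and callback below are hypothetical:

```python
from lark.common import ParserConf
from lark.parsers import lalr_parser

class Callbacks:
    def number(self, args):          # one method per rule alias
        return ('number', args)

rules = [('expr', ['NUMBER'], 'number')]   # (origin, expansion, alias) tuples
conf = ParserConf(rules, Callbacks(), 'expr')

parser = lalr_parser.Parser(conf)   # builds and runs LALR_Analyzer internally
# parser.analysis.states_idx is what LALR_ContextualLexer reads to construct
# one sub-lexer per parser state.
```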
diff --git a/lark/parsers/nearley.py b/lark/parsers/nearley.py
new file mode 100644
index 0000000..b2a511e
--- /dev/null
+++ b/lark/parsers/nearley.py
@@ -0,0 +1,155 @@
+"My name is Earley"
+
+from ..utils import classify, STRING_TYPE
+from ..common import ParseError, UnexpectedToken
+
+try:
+    xrange
+except NameError:
+    xrange = range
+
+class MatchFailed(object):
+    pass
+
+class AbortParseMatch(Exception):
+    pass
+
+
+class Rule(object):
+    def __init__(self, name, symbols, postprocess):
+        self.name = name
+        self.symbols = symbols
+        self.postprocess = postprocess
+
+class State(object):
+    def __init__(self, rule, expect, reference, data=None):
+        self.rule = rule
+        self.expect = expect
+        self.reference = reference
+        self.data = data or []
+
+        self.is_complete = (self.expect == len(self.rule.symbols))
+        if not self.is_complete:
+            self.expect_symbol = self.rule.symbols[self.expect]
+            self.is_terminal = isinstance(self.expect_symbol, tuple)
+        else:
+            self.is_terminal = False
+
+    def next_state(self, data):
+        return State(self.rule, self.expect+1, self.reference, self.data + [data])
+
+    def consume_terminal(self, inp):
+        if not self.is_complete and self.is_terminal:
+            # PORT: originally tests regexp
+
+            if self.expect_symbol[1] is not None:
+                match = self.expect_symbol[1].match(inp)
+                if match:
+                    return self.next_state(inp)
+
+            elif self.expect_symbol[0] == inp.type:
+                return self.next_state(inp)
+
+    def consume_nonterminal(self, inp):
+        if not self.is_complete and not self.is_terminal:
+
+            if self.expect_symbol == inp:
+                return self.next_state(inp)
+
+    def process(self, location, ind, table, rules, added_rules):
+
+        if self.is_complete:
+            # Completed a rule
+            if self.rule.postprocess:
+                try:
+                    self.data = self.rule.postprocess(self.data)
+                except AbortParseMatch:
+                    self.data = MatchFailed
+
+            if self.data is not MatchFailed:
+                for s in table[self.reference]:
+                    x = s.consume_nonterminal(self.rule.name)
+                    if x:
+                        x.data[-1] = self.data
+                        x.epsilon_closure(location, ind, table)
+
+        else:
+            exp = self.rule.symbols[self.expect]
+            if isinstance(exp, tuple):
+                return
+
+            for r in rules[exp]:
+                assert r.name == exp
+                if r not in added_rules:
+                    if r.symbols:
+                        added_rules.add(r)
+                        State(r, 0, location).epsilon_closure(location, ind, table)
+                    else:
+                        # Empty rule
+                        new_copy = self.consume_nonterminal(r.name)
+                        new_copy.data[-1] = r.postprocess([]) if r.postprocess else []
+
+                        new_copy.epsilon_closure(location, ind, table)
+
+    def epsilon_closure(self, location, ind, table):
+        col = table[location]
+        col.append(self)
+
+        if not self.is_complete:
+            for i in xrange(ind):
+                state = col[i]
+                if state.is_complete and state.reference == location:
+                    x = self.consume_nonterminal(state.rule.name)
+                    if x:
+                        x.data[-1] = state.data
+                        x.epsilon_closure(location, ind, table)
+
+
+class Parser(object):
+    def __init__(self, rules, start=None):
+        self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
+        self.rules_by_name = classify(self.rules, lambda r: r.name)
+        self.start = start or self.rules[0].name
+
+    def advance_to(self, table, added_rules):
+        n = len(table)-1
+        for w, s in enumerate(table[n]):
+            s.process(n, w, table, self.rules_by_name, added_rules)
+
+    def parse(self, stream):
+        initial_rules = set(self.rules_by_name[self.start])
+        table = [[State(r, 0, 0) for r in initial_rules]]
+        self.advance_to(table, initial_rules)
+
+        i = 0
+
+        while i < len(stream):
+            col = []
+
+            token = stream[i]
+            for s in table[-1]:
+                x = s.consume_terminal(token)
+                if x:
+                    col.append(x)
+
+            if not col:
+                expected = {s.expect_symbol for s in table[-1] if s.is_terminal}
+                raise UnexpectedToken(stream[i], expected, stream, i)
+
+            table.append(col)
+            self.advance_to(table, set())
+
+            i += 1
+
+        res = list(self.finish(table))
+        if not res:
+            raise ParseError('Incomplete parse')
+        return res
+
+    def finish(self, table):
+        for t in table[-1]:
+            if (t.rule.name == self.start
+                and t.expect == len(t.rule.symbols)
+                and t.reference == 0
+                and t.data is not MatchFailed):
+                yield t.data
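Finally, the engine table: after this change 'earley' points at the new chart parser, while the nearley.js-style port survives as `nearley.Parser` for the Nearley frontends above. A usage sketch of the dispatch (the conf objects come from lark's loader and are elided here):

```python
from lark.parser_frontends import ENGINE_DICT

def make_parser(engine_name, lexer_conf, parser_conf):
    # engine_name: 'lalr', 'earley', 'earley_nolex' or 'lalr_contextual_lexer'
    engine_class = ENGINE_DICT[engine_name]
    return engine_class(lexer_conf, parser_conf)
```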