Improvements and cleanup to the Earley parser

8 years ago · fee18a8d8a
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,3 +1,5 @@
 import re

 from .lexer import Lexer
 from .parsers.lalr_analysis import GrammarAnalyzer

@@ -33,22 +35,46 @@ class Earley(WithLexer):
        WithLexer.__init__(self, lexer_conf)

        rules = [{'name':n,
                  'symbols': self._process_expansion(x),
                  'symbols': list(self._prepare_expansion(x)),
                  'postprocess': getattr(parser_conf.callback, a)}
                  for n,x,a in parser_conf.rules]

        self.parser = earley.Parser(rules, parser_conf.start)

    def _prepare_expansion(self, expansion):
        """Yield each symbol of `expansion` in the form passed to the parser.

        A terminal is yielded as a ``(name, None)`` pair; any other symbol is
        yielded unchanged.
        """
        for symbol in expansion:
            yield (symbol, None) if is_terminal(symbol) else symbol

    def parse(self, text):
        """Lex `text`, run the parser on the tokens and return the single result.

        The text is tokenized with ``self.lex`` and the full token list is
        handed to ``self.parser.parse``.

        Raises:
            AssertionError: if the parser does not return exactly one result
                (ambiguous parses are not handled yet).
        """
        tokens = list(self.lex(text))
        res = self.parser.parse(tokens)
        # Exactly one result is supported; anything else is reported rather
        # than silently picking one of the candidates.
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

    @staticmethod
    def _process_expansion(x):
        """Return the symbols of expansion `x` as a list.

        A terminal is wrapped as ``{'literal': symbol}``; any other symbol is
        kept as-is.
        """
        symbols = []
        for sym in x:
            if is_terminal(sym):
                symbols.append({'literal': sym})
            else:
                symbols.append(sym)
        return symbols
 class Earley2:
    """Earley front-end that hands the raw text straight to the parser.

    No token stream is produced here: every terminal in a rule is paired
    with a regular expression compiled from the token definition of the
    same name.
    """

    def __init__(self, lexer_conf, parser_conf):
        """Build the Earley parser from the lexer and parser configuration.

        ``lexer_conf.tokens`` supplies the token definitions (indexed by
        name); ``parser_conf.rules`` supplies (name, expansion,
        callback-name) triples, ``parser_conf.callback`` the object holding
        the callbacks and ``parser_conf.start`` the start rule.
        """
        self.token_by_name = {t.name: t for t in lexer_conf.tokens}

        rules = [{'name': n,
                  'symbols': list(self._prepare_expansion(x)),
                  'postprocess': getattr(parser_conf.callback, a)}
                 for n, x, a in parser_conf.rules]

        self.parser = earley.Parser(rules, parser_conf.start)

    def _prepare_expansion(self, expansion):
        """Yield each symbol of `expansion` in the form passed to the parser.

        A terminal is yielded as ``(name, compiled_regexp)``, the regexp
        coming from the token's ``to_regexp()``; any other symbol is yielded
        unchanged.
        """
        for sym in expansion:
            if is_terminal(sym):
                yield sym, re.compile(self.token_by_name[sym].to_regexp())
            else:
                yield sym

    def parse(self, text):
        """Run the parser directly on `text` and return the single result.

        Raises:
            AssertionError: if the parser does not return exactly one result
                (ambiguous parses are not handled yet).
        """
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

 # Parser front-end classes keyed by engine name.
 ENGINE_DICT = {
     'lalr': LALR,
     'earley': Earley,
 }
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -31,37 +31,35 @@ class State(object):
        self.is_complete = (self.expect == len(self.rule.symbols))
        if not self.is_complete:
            self.expect_symbol = self.rule.symbols[self.expect]
            self.is_literal = isinstance(self.expect_symbol, dict)
            if self.is_literal:
                self.expect_symbol = self.expect_symbol['literal']
            assert isinstance(self.expect_symbol, STRING_TYPE), self.expect_symbol
            self.is_terminal = isinstance(self.expect_symbol, tuple)
        else:
            self.is_literal = False
            self.is_terminal = False

    def next_state(self, data):
        """Return a new State for the same rule, advanced by one symbol.

        The new state keeps this state's reference and carries a copy of the
        collected data with `data` appended; this state is left unmodified.
        """
        collected = self.data + [data]
        return State(self.rule, self.expect + 1, self.reference, collected)

    def consume_terminal(self, inp):
        # Try to advance this state over the input token `inp`.
        # Returns the state produced by next_state(inp) when the expected
        # terminal matches inp.type; otherwise falls through and returns None.
        # NOTE(review): several lines below appear in two variants back to back
        # (is_literal / is_terminal, expect_symbol / expect_symbol[0]); this
        # looks like before/after lines of a change shown together - confirm
        # which variant applies.
        if not self.is_complete and self.is_literal:
        if not self.is_complete and self.is_terminal:
            # PORT: originally tests regexp

            if self.expect_symbol == inp.type:
            # expect_symbol is indexed as a pair here: [0] is compared with the
            # token type below, [1] is used as a compiled pattern when not None.
            if self.expect_symbol[1] is not None:
                # NOTE(review): `stream` and `pos` are neither parameters nor
                # locals of this method, and `match` is not used afterwards;
                # unless they are module-level names this line raises
                # NameError - confirm.
                match = self.expect_symbol[1].match(stream, pos)

            if self.expect_symbol[0] == inp.type:
                return self.next_state(inp)

    def consume_nonterminal(self, inp):
        # Try to advance this state over the nonterminal named `inp`.
        # Returns the state produced by next_state(inp) when the state is not
        # complete, expects a non-terminal symbol, and that symbol equals
        # `inp`; otherwise falls through and returns None.
        # NOTE(review): the two guard lines below (is_literal / is_terminal)
        # look like before/after variants of the same condition - confirm
        # which one applies.
        if not self.is_complete and not self.is_literal:
        if not self.is_complete and not self.is_terminal:

            if self.expect_symbol == inp:
                return self.next_state(inp)

    def process(self, location, ind, table, rules, added_rules):

        if self.is_complete:
            # Completed a rule
            if self.rule.postprocess:
                try:
                    # self.data = self.rule.postprocess(self.data, self.reference)
                    # import pdb
                    # pdb.set_trace()
                    self.data = self.rule.postprocess(self.data)
                except AbortParseMatch:
                    self.data = MatchFailed
@@ -75,7 +73,7 @@ class State(object):

        else:
            exp = self.rule.symbols[self.expect]
            if isinstance(exp, dict):
            if isinstance(exp, tuple):
                return

            for r in rules[exp]:
@@ -87,19 +85,13 @@ class State(object):
                    else:
                        # Empty rule
                        new_copy = self.consume_nonterminal(r.name)
                        if r.postprocess:
                            new_copy.data[-1] = r.postprocess([])
                        else:
                            new_copy.data[-1] = []
                        new_copy.data[-1] = r.postprocess([]) if r.postprocess else []

                        new_copy.epsilon_closure(location, ind, table)

    def epsilon_closure(self, location, ind, table, result=None):
    def epsilon_closure(self, location, ind, table):
        col = table[location]
        if not result:
            result = col

        result.append(self)
        col.append(self)

        if not self.is_complete:
            for i in xrange(ind):
@@ -117,29 +109,35 @@ class Parser(object):
        self.rules_by_name = classify(self.rules, lambda r: r.name)
        self.start = start or self.rules[0].name

    def advance_to(self, table, n, added_rules):
    def advance_to(self, table, added_rules):
        # Call process() on every state in the last column of `table`.
        # NOTE(review): two signatures appear above (with and without `n`);
        # they look like before/after variants - in the second, `n` is derived
        # from the table length on the next line.
        n = len(table)-1
        # `w` is the state's index within the column; it is passed to process()
        # together with the column index `n`.
        # NOTE(review): process() appears able to append to table[n] while this
        # loop runs (epsilon_closure appends to table[location]); if so, the
        # appended states are visited by this same loop - confirm.
        for w, s in enumerate(table[n]):
            s.process(n, w, table, self.rules_by_name, added_rules)

    def parse(self, stream):
        initial_rules = set(self.rules_by_name[self.start])
        table = [[State(r, 0, 0) for r in initial_rules]]
        self.advance_to(table, 0, initial_rules)
        self.advance_to(table, initial_rules)

        i = 0

        for pos, token in enumerate(stream):
            table.append([])
        while i < len(stream):
            col = []

            for s in table[pos]:
            token = stream[i]
            for s in table[-1]:
                x = s.consume_terminal(token)
                if x:
                    table[pos + 1].append(x)
                    col.append(x)

            if not col:
                expected = {s.expect_symbol for s in table[-1] if s.is_terminal}
                raise UnexpectedToken(stream[i], expected, stream, i)

            self.advance_to(table, pos + 1, set())
            table.append(col)
            self.advance_to(table, set())

            if not table[-1]:
                expected = {s.expect_symbol for s in table[-2] if s.is_literal}
                raise UnexpectedToken(stream[pos], expected, stream, pos)
            i += 1

        res = list(self.finish(table))
        if not res: