
Merge pull request #781 from lark-parser/refactor_frontends

Refactored parser_frontends. Now significantly simpler
Erez Shinan, 4 years ago (committed by GitHub)
commit 692a950488
8 changed files with 164 additions and 177 deletions
  1. lark/common.py (+9 -3)
  2. lark/exceptions.py (+7 -0)
  3. lark/lark.py (+1 -5)
  4. lark/load_grammar.py (+4 -3)
  5. lark/parser_frontends.py (+139 -165)
  6. lark/parsers/lalr_parser.py (+2 -1)
  7. lark/utils.py (+1 -0)
  8. tests/test_parser.py (+1 -0)

lark/common.py (+9 -3)

@@ -5,7 +5,7 @@ from .lexer import TerminalDef


class LexerConf(Serialize):
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
__serialize_namespace__ = TerminalDef,

def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
@@ -18,12 +18,18 @@ class LexerConf(Serialize):
self.skip_validation = skip_validation
self.use_bytes = use_bytes

###}
self.lexer_type = None


class ParserConf(Serialize):
__serialize_fields__ = 'rules', 'start', 'parser_type'

class ParserConf:
def __init__(self, rules, callbacks, start):
assert isinstance(start, list)
self.rules = rules
self.callbacks = callbacks
self.start = start

self.parser_type = None

###}
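
Below, a minimal sketch (not part of the diff) of how the new type tags are meant to be used: the constructors keep their existing signatures, and the frontend machinery assigns `lexer_type` / `parser_type` after construction. The empty arguments are placeholders for illustration.

```python
import re
from lark.common import LexerConf, ParserConf

# Placeholder arguments; real usage passes the grammar's terminals, rules and callbacks.
lexer_conf = LexerConf([], re)               # tokens=[], re_module=re
lexer_conf.lexer_type = 'standard'           # assigned later by MakeParsingFrontend

parser_conf = ParserConf([], {}, ['start'])  # rules=[], callbacks={}, start=['start']
parser_conf.parser_type = 'lalr'             # likewise assigned by the frontend factory
```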

lark/exceptions.py (+7 -0)

@@ -11,6 +11,11 @@ class ConfigurationError(LarkError, ValueError):
pass


def assert_config(value, options, msg='Got %r, expected one of %s'):
if value not in options:
raise ConfigurationError(msg % (value, options))


class GrammarError(LarkError):
pass

@@ -198,4 +203,6 @@ class VisitError(LarkError):

message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)


###}
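
A small usage sketch of the relocated `assert_config` helper (the options tuple and the try/except are illustrative, not from the diff):

```python
from lark.exceptions import assert_config, ConfigurationError

assert_config('lalr', ('lalr', 'earley', 'cyk'))      # valid value: passes silently

try:
    assert_config('peg', ('lalr', 'earley', 'cyk'))   # invalid value
except ConfigurationError as e:
    print(e)  # Got 'peg', expected one of ('lalr', 'earley', 'cyk')
```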

lark/lark.py (+1 -5)

@@ -1,5 +1,5 @@
from __future__ import absolute_import
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError
from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config

import sys, os, pickle, hashlib
from io import open
@@ -24,10 +24,6 @@ except ImportError:

###{standalone

def assert_config(value, options, msg='Got %r, expected one of %s'):
if value not in options:
raise ConfigurationError(msg % (value, options))


class LarkOptions(Serialize):
"""Specifies the options for Lark


lark/load_grammar.py (+4 -3)

@@ -11,7 +11,7 @@ from .utils import bfs, Py36, logger, classify_bool
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR_TraditionalLexer
from .parser_frontends import ParsingFrontend
from .common import LexerConf, ParserConf
from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
from .utils import classify, suppress, dedup_list, Str
@@ -883,9 +883,10 @@ class GrammarLoader:
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
lexer_conf.lexer_type = 'standard'
parser_conf.parser_type = 'lalr'
self.parser = ParsingFrontend(lexer_conf, parser_conf, {})

self.canonize_tree = CanonizeTree()
self.global_keep_all_tokens = global_keep_all_tokens


lark/parser_frontends.py (+139 -165)

@@ -1,12 +1,11 @@
from .exceptions import ConfigurationError, GrammarError
from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .grammar import Rule
from .tree import Tree
from .common import LexerConf
from .common import LexerConf, ParserConf
try:
import regex
except ImportError:
@@ -27,56 +26,106 @@ def _wrap_lexer(lexer_class):
return self.lexer.lex(lexer_state.text)
return CustomLexerWrapper

def get_frontend(parser, lexer):
if parser=='lalr':
if lexer is None:
raise ConfigurationError('The LALR parser requires use of a lexer')
elif lexer == 'standard':
return LALR_TraditionalLexer
elif lexer == 'contextual':
return LALR_ContextualLexer
elif issubclass(lexer, Lexer):
wrapped = _wrap_lexer(lexer)
class LALR_CustomLexerWrapper(LALR_WithLexer):
def init_lexer(self):
self.lexer = wrapped(self.lexer_conf)
return LALR_CustomLexerWrapper
else:
raise ConfigurationError('Unknown lexer: %s' % lexer)
elif parser=='earley':
if lexer=='standard':
return Earley_Traditional
elif lexer=='dynamic':
return XEarley
elif lexer=='dynamic_complete':
return XEarley_CompleteLex
elif lexer=='contextual':
raise ConfigurationError('The Earley parser does not support the contextual parser')
elif issubclass(lexer, Lexer):
wrapped = _wrap_lexer(lexer)
class Earley_CustomLexerWrapper(Earley_WithLexer):
def init_lexer(self, **kw):
self.lexer = wrapped(self.lexer_conf)
return Earley_CustomLexerWrapper

class MakeParsingFrontend:
def __init__(self, parser_type, lexer_type):
self.parser_type = parser_type
self.lexer_type = lexer_type

def __call__(self, lexer_conf, parser_conf, options):
assert isinstance(lexer_conf, LexerConf)
assert isinstance(parser_conf, ParserConf)
parser_conf.parser_type = self.parser_type
lexer_conf.lexer_type = self.lexer_type
return ParsingFrontend(lexer_conf, parser_conf, options)

@classmethod
def deserialize(cls, data, memo, callbacks, options):
lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
parser_conf.callbacks = callbacks

terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]

lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
lexer_conf.re_module = regex if options.regex else re
lexer_conf.use_bytes = options.use_bytes
lexer_conf.g_regex_flags = options.g_regex_flags
lexer_conf.skip_validation = True
lexer_conf.postlex = options.postlex

return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)




class ParsingFrontend(Serialize):
__serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options'

def __init__(self, lexer_conf, parser_conf, options, parser=None):
self.parser_conf = parser_conf
self.lexer_conf = lexer_conf
self.options = options

# Set-up parser
if parser: # From cache
self.parser = parser
else:
raise ConfigurationError('Unknown lexer: %s' % lexer)
elif parser == 'cyk':
if lexer == 'standard':
return CYK
create_parser = {
'lalr': create_lalr_parser,
'earley': create_earley_parser,
'cyk': CYK_FrontEnd,
}[parser_conf.parser_type]
self.parser = create_parser(lexer_conf, parser_conf, options)

# Set-up lexer
lexer_type = lexer_conf.lexer_type
self.skip_lexer = False
if lexer_type in ('dynamic', 'dynamic_complete'):
self.skip_lexer = True
return

try:
create_lexer = {
'standard': create_traditional_lexer,
'contextual': create_contextual_lexer,
}[lexer_type]
except KeyError:
assert issubclass(lexer_type, Lexer), lexer_type
self.lexer = _wrap_lexer(lexer_type)(lexer_conf)
else:
raise ConfigurationError('CYK parser requires using standard parser.')
else:
raise ConfigurationError('Unknown parser: %s' % parser)
self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex)

if lexer_conf.postlex:
self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)

class _ParserFrontend(Serialize):
def _parse(self, start, input, *args):
def parse(self, text, start=None):
if start is None:
start = self.start
start = self.parser_conf.start
if len(start) > 1:
raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
start ,= start
return self.parser.parse(input, start, *args)

if self.skip_lexer:
return self.parser.parse(text, start)

lexer_thread = LexerThread(self.lexer, text)
return self.parser.parse(lexer_thread, start)


def get_frontend(parser, lexer):
assert_config(parser, ('lalr', 'earley', 'cyk'))
if not isinstance(lexer, type): # not custom lexer?
expected = {
'lalr': ('standard', 'contextual'),
'earley': ('standard', 'dynamic', 'dynamic_complete'),
'cyk': ('standard', ),
}[parser]
assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)

return MakeParsingFrontend(parser, lexer)


def _get_lexer_callbacks(transformer, terminals):
@@ -100,119 +149,26 @@ class PostLexConnector:
return self.postlexer.process(i)


class WithLexer(_ParserFrontend):
lexer = None
parser = None
lexer_conf = None
start = None

__serialize_fields__ = 'parser', 'lexer_conf', 'start'
__serialize_namespace__ = LexerConf,

def __init__(self, lexer_conf, parser_conf, options=None):
self.lexer_conf = lexer_conf
self.start = parser_conf.start
self.postlex = lexer_conf.postlex

@classmethod
def deserialize(cls, data, memo, callbacks, options):
inst = super(WithLexer, cls).deserialize(data, memo)

inst.postlex = options.postlex
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)

terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
inst.lexer_conf.re_module = regex if options.regex else re
inst.lexer_conf.use_bytes = options.use_bytes
inst.lexer_conf.g_regex_flags = options.g_regex_flags
inst.lexer_conf.skip_validation = True
inst.init_lexer()

return inst

def _serialize(self, data, memo):
data['parser'] = data['parser'].serialize(memo)

def make_lexer(self, text):
lexer = self.lexer
if self.postlex:
lexer = PostLexConnector(self.lexer, self.postlex)
return LexerThread(lexer, text)

def parse(self, text, start=None):
return self._parse(start, self.make_lexer(text))

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf)

class LALR_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)

self.init_lexer()
def create_traditional_lexer(lexer_conf, parser, postlex):
return TraditionalLexer(lexer_conf)

def init_lexer(self, **kw):
raise NotImplementedError()
def create_contextual_lexer(lexer_conf, parser, postlex):
states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}
always_accept = postlex.always_accept if postlex else ()
return ContextualLexer(lexer_conf, states, always_accept=always_accept)

class LALR_TraditionalLexer(LALR_WithLexer):
def init_lexer(self):
self.init_traditional_lexer()
def create_lalr_parser(lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
return LALR_Parser(parser_conf, debug=debug)

class LALR_ContextualLexer(LALR_WithLexer):
def init_lexer(self):
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)

create_earley_parser = NotImplemented
CYK_FrontEnd = NotImplemented
###}


class Earley_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
self.init_lexer()

resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

def match(self, term, token):
return term.name == token.type

def init_lexer(self, **kw):
raise NotImplementedError()

class Earley_Traditional(Earley_WithLexer):
def init_lexer(self, **kw):
self.init_traditional_lexer()


class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.start = parser_conf.start

self._prepare_match(lexer_conf)
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
self.parser = xearley.Parser(parser_conf,
self.match,
ignore=lexer_conf.ignore,
resolve_ambiguity=resolve_ambiguity,
debug=debug,
tree_class=tree_class,
**kw
)

def match(self, term, text, index=0):
return self.regexps[term.name].match(text, index)

def _prepare_match(self, lexer_conf):
class EarleyRegexpMatcher:
def __init__(self, lexer_conf):
self.regexps = {}
for t in lexer_conf.tokens:
if t.priority != 1:
@@ -230,31 +186,49 @@ class XEarley(_ParserFrontend):

self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(start, text)
def match(self, term, text, index=0):
return self.regexps[term.name].match(text, index)

class XEarley_CompleteLex(XEarley):
def __init__(self, *args, **kw):
XEarley.__init__(self, *args, complete_lex=True, **kw)

def create_earley_parser__dynamic(lexer_conf, parser_conf, options=None, **kw):
earley_matcher = EarleyRegexpMatcher(lexer_conf)
return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw)

def _match_earley_basic(term, token):
return term.name == token.type

class CYK(WithLexer):
def create_earley_parser__basic(lexer_conf, parser_conf, options, **kw):
return earley.Parser(parser_conf, _match_earley_basic, **kw)

def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
self.init_traditional_lexer()
def create_earley_parser(lexer_conf, parser_conf, options):
resolve_ambiguity = options.ambiguity == 'resolve'
debug = options.debug if options else False
tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None

extra = {}
if lexer_conf.lexer_type == 'dynamic':
f = create_earley_parser__dynamic
elif lexer_conf.lexer_type == 'dynamic_complete':
extra['complete_lex'] =True
f = create_earley_parser__dynamic
else:
f = create_earley_parser__basic

return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)



class CYK_FrontEnd:
def __init__(self, lexer_conf, parser_conf, options=None):
self._analysis = GrammarAnalyzer(parser_conf)
self.parser = cyk.Parser(parser_conf.rules)

self.callbacks = parser_conf.callbacks

def parse(self, text, start):
tokens = list(self.make_lexer(text).lex(None))
parse = self._parse(start, tokens)
parse = self._transform(parse)
return parse
def parse(self, lexer_thread, start):
tokens = list(lexer_thread.lex(None))
tree = self.parser.parse(tokens, start)
return self._transform(tree)

def _transform(self, tree):
subtrees = list(tree.iter_subtrees())
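
A hedged sketch of the new construction path introduced by this refactor: `get_frontend()` validates the (parser, lexer) pair and returns a `MakeParsingFrontend` factory, which stamps the type tags onto the configs and builds a single `ParsingFrontend`. The commented-out lines assume `lexer_conf`, `parser_conf` and `options` from Lark's usual setup and are not constructed here.

```python
from lark.exceptions import ConfigurationError
from lark.parser_frontends import get_frontend

# Valid combination: returns a MakeParsingFrontend factory instead of a per-combination class.
make_frontend = get_frontend('lalr', 'contextual')

# Unsupported pairing: rejected up front by assert_config.
try:
    get_frontend('earley', 'contextual')
except ConfigurationError as e:
    print(e)

# Hypothetical usage, assuming configs/options built elsewhere by Lark:
# frontend = make_frontend(lexer_conf, parser_conf, options)
# tree = frontend.parse("some input", start='start')
```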


lark/parsers/lalr_parser.py (+2 -1)

@@ -5,13 +5,14 @@
from copy import deepcopy, copy
from ..exceptions import UnexpectedInput, UnexpectedToken
from ..lexer import Token
from ..utils import Serialize

from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
from .lalr_puppet import ParserPuppet

###{standalone

class LALR_Parser(object):
class LALR_Parser(Serialize):
def __init__(self, parser_conf, debug=False):
analysis = LALR_Analyzer(parser_conf, debug=debug)
analysis.compute_lalr()
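
Deriving `LALR_Parser` from `Serialize` is what allows the new `ParsingFrontend` to list `parser` among its `__serialize_fields__`. A round-trip sketch using Lark's public save/load API (the grammar is a throwaway example; the tie-in to `MakeParsingFrontend.deserialize` is an assumption based on the diff above):

```python
from lark import Lark

parser = Lark('start: "a"+', parser='lalr')

with open('parser.bin', 'wb') as f:
    parser.save(f)             # serializes the frontend, including the LALR tables

with open('parser.bin', 'rb') as f:
    restored = Lark.load(f)    # presumably restored via the deserialize path above

print(restored.parse('aaa'))
```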


lark/utils.py (+1 -0)

@@ -302,4 +302,5 @@ def _serialize(value, memo):
return list(value) # TODO reversible?
elif isinstance(value, dict):
return {key:_serialize(elem, memo) for key, elem in value.items()}
# assert value is None or isinstance(value, (int, float, str, tuple)), value
return value

tests/test_parser.py (+1 -0)

@@ -2471,6 +2471,7 @@ _TO_TEST = [
('contextual', 'lalr'),

('custom_new', 'lalr'),
('custom_new', 'cyk'),
('custom_old', 'earley'),
]


