diff --git a/lark/common.py b/lark/common.py
index 4bf04ec..efbab01 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -5,7 +5,7 @@ from .lexer import TerminalDef
 
 
 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'name'
     __serialize_namespace__ = TerminalDef,
 
     def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
@@ -18,12 +18,18 @@ class LexerConf(Serialize):
         self.skip_validation = skip_validation
         self.use_bytes = use_bytes
 
-###}
+        self.name = None
+
+class ParserConf(Serialize):
+    __serialize_fields__ = 'rules', 'start', 'name'
 
 
-class ParserConf:
     def __init__(self, rules, callbacks, start):
         assert isinstance(start, list)
         self.rules = rules
         self.callbacks = callbacks
         self.start = start
+
+        self.name = None
+
+###}
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 70fd7eb..36bf849 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -11,7 +11,7 @@ from .utils import bfs, Py36, logger, classify_bool
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 
 from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import LALR_TraditionalLexer
+from .parser_frontends import ParsingFrontend
 from .common import LexerConf, ParserConf
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress, dedup_list, Str
@@ -883,9 +883,10 @@ class GrammarLoader:
         callback = ParseTreeBuilder(rules, ST).create_callback()
         import re
         lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
-        parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        lexer_conf.name = 'standard'
+        parser_conf.name = 'lalr'
+        self.parser = ParsingFrontend(lexer_conf, parser_conf, {})
 
         self.canonize_tree = CanonizeTree()
 
         self.global_keep_all_tokens = global_keep_all_tokens
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 5d32589..4061811 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -4,9 +4,8 @@ from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
-from .grammar import Rule
 from .tree import Tree
-from .common import LexerConf
+from .common import LexerConf, ParserConf
 try:
     import regex
 except ImportError:
@@ -27,56 +26,112 @@ def _wrap_lexer(lexer_class):
             return self.lexer.lex(lexer_state.text)
         return CustomLexerWrapper
 
 
+
+class MakeParsingFrontend:
+    def __init__(self, parser, lexer):
+        self.parser = parser
+        self.lexer = lexer
+
+    def __call__(self, lexer_conf, parser_conf, options):
+        assert isinstance(lexer_conf, LexerConf)
+        assert isinstance(parser_conf, ParserConf)
+        parser_conf.name = self.parser
+        lexer_conf.name = self.lexer
+        return ParsingFrontend(lexer_conf, parser_conf, options)
+
+    @classmethod
+    def deserialize(cls, data, memo, callbacks, options):
+        lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
+        parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
+        parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
+        parser_conf.callbacks = callbacks
+
+        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
+
+        lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
+        lexer_conf.re_module = regex if options.regex else re
+        lexer_conf.use_bytes = options.use_bytes
+        lexer_conf.g_regex_flags = options.g_regex_flags
+        lexer_conf.skip_validation = True
+        lexer_conf.postlex = options.postlex
+
+        return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)
+
+
+
+
+class ParsingFrontend(Serialize):
+    __serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options'
+
+    def __init__(self, lexer_conf, parser_conf, options, parser=None):
+        self.parser_conf = parser_conf
+        self.lexer_conf = lexer_conf
+        self.options = options
+
+        # Set-up parser
+        if parser:  # From cache
+            self.parser = parser
+        else:
+            create_parser = {
+                'lalr': create_lalr_parser,
+                'earley': make_early,
+                'cyk': CYK_FrontEnd,
+            }[parser_conf.name]
+            self.parser = create_parser(lexer_conf, parser_conf, options)
+
+        # Set-up lexer
+        self.skip_lexer = False
+        if lexer_conf.name in ('dynamic', 'dynamic_complete'):
+            self.skip_lexer = True
+            return
+
+        try:
+            create_lexer = {
+                'standard': create_traditional_lexer,
+                'contextual': create_contextual_lexer,
+            }[lexer_conf.name]
+        except KeyError:
+            assert issubclass(lexer_conf.name, Lexer), lexer_conf.name
+            self.lexer = _wrap_lexer(lexer_conf.name)(lexer_conf)
+        else:
+            self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex)
+
+        if lexer_conf.postlex:
+            self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)
+
+
+    def _parse(self, start, input, *args):
+        if start is None:
+            start = self.parser_conf.start
+            if len(start) > 1:
+                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
+            start ,= start
+        return self.parser.parse(input, start, *args)
+
+    def parse(self, text, start=None):
+        if self.skip_lexer:
+            return self._parse(start, text)
+
+        lexer = LexerThread(self.lexer, text)
+        return self._parse(start, lexer)
+
+
 def get_frontend(parser, lexer):
     if parser=='lalr':
         if lexer is None:
             raise ConfigurationError('The LALR parser requires use of a lexer')
-        elif lexer == 'standard':
-            return LALR_TraditionalLexer
-        elif lexer == 'contextual':
-            return LALR_ContextualLexer
-        elif issubclass(lexer, Lexer):
-            wrapped = _wrap_lexer(lexer)
-            class LALR_CustomLexerWrapper(LALR_WithLexer):
-                def init_lexer(self):
-                    self.lexer = wrapped(self.lexer_conf)
-            return LALR_CustomLexerWrapper
-        else:
+        if lexer not in ('standard' ,'contextual') and not issubclass(lexer, Lexer):
             raise ConfigurationError('Unknown lexer: %s' % lexer)
     elif parser=='earley':
-        if lexer=='standard':
-            return Earley_Traditional
-        elif lexer=='dynamic':
-            return XEarley
-        elif lexer=='dynamic_complete':
-            return XEarley_CompleteLex
-        elif lexer=='contextual':
+        if lexer=='contextual':
             raise ConfigurationError('The Earley parser does not support the contextual parser')
-        elif issubclass(lexer, Lexer):
-            wrapped = _wrap_lexer(lexer)
-            class Earley_CustomLexerWrapper(Earley_WithLexer):
-                def init_lexer(self, **kw):
-                    self.lexer = wrapped(self.lexer_conf)
-            return Earley_CustomLexerWrapper
-        else:
-            raise ConfigurationError('Unknown lexer: %s' % lexer)
     elif parser == 'cyk':
-        if lexer == 'standard':
-            return CYK
-        else:
+        if lexer != 'standard':
             raise ConfigurationError('CYK parser requires using standard parser.')
     else:
         raise ConfigurationError('Unknown parser: %s' % parser)
-
-class _ParserFrontend(Serialize):
-    def _parse(self, start, input, *args):
-        if start is None:
-            start = self.start
-            if len(start) > 1:
-                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
-            start ,= start
-        return self.parser.parse(input, start, *args)
+    return MakeParsingFrontend(parser, lexer)
 
 
 def _get_lexer_callbacks(transformer, terminals):
@@ -100,119 +155,26 @@ class PostLexConnector:
         return self.postlexer.process(i)
 
 
-class WithLexer(_ParserFrontend):
-    lexer = None
-    parser = None
-    lexer_conf = None
-    start = None
-
-    __serialize_fields__ = 'parser', 'lexer_conf', 'start'
-    __serialize_namespace__ = LexerConf,
-
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        self.lexer_conf = lexer_conf
-        self.start = parser_conf.start
-        self.postlex = lexer_conf.postlex
-
-    @classmethod
-    def deserialize(cls, data, memo, callbacks, options):
-        inst = super(WithLexer, cls).deserialize(data, memo)
-
-        inst.postlex = options.postlex
-        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)
-
-        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
-        inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
-        inst.lexer_conf.re_module = regex if options.regex else re
-        inst.lexer_conf.use_bytes = options.use_bytes
-        inst.lexer_conf.g_regex_flags = options.g_regex_flags
-        inst.lexer_conf.skip_validation = True
-        inst.init_lexer()
-
-        return inst
-
-    def _serialize(self, data, memo):
-        data['parser'] = data['parser'].serialize(memo)
-
-    def make_lexer(self, text):
-        lexer = self.lexer
-        if self.postlex:
-            lexer = PostLexConnector(self.lexer, self.postlex)
-        return LexerThread(lexer, text)
-
-    def parse(self, text, start=None):
-        return self._parse(start, self.make_lexer(text))
-
-    def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf)
 
 
-class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        debug = options.debug if options else False
-        self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+def create_traditional_lexer(lexer_conf, parser, postlex):
+    return TraditionalLexer(lexer_conf)
 
-        self.init_lexer()
+def create_contextual_lexer(lexer_conf, parser, postlex):
+    states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}
+    always_accept = postlex.always_accept if postlex else ()
+    return ContextualLexer(lexer_conf, states, always_accept=always_accept)
 
-    def init_lexer(self, **kw):
-        raise NotImplementedError()
+def create_lalr_parser(lexer_conf, parser_conf, options=None):
+    debug = options.debug if options else False
+    return LALR_Parser(parser_conf, debug=debug)
 
-class LALR_TraditionalLexer(LALR_WithLexer):
-    def init_lexer(self):
-        self.init_traditional_lexer()
-
-class LALR_ContextualLexer(LALR_WithLexer):
-    def init_lexer(self):
-        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
-        always_accept = self.postlex.always_accept if self.postlex else ()
-        self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
+make_early = NotImplemented
+CYK_FrontEnd = NotImplemented
 
 ###}
-
-class Earley_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
-        self.init_lexer()
-
-        resolve_ambiguity = options.ambiguity == 'resolve'
-        debug = options.debug if options else False
-        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
-        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)
-
-    def match(self, term, token):
-        return term.name == token.type
-
-    def init_lexer(self, **kw):
-        raise NotImplementedError()
-
-class Earley_Traditional(Earley_WithLexer):
-    def init_lexer(self, **kw):
-        self.init_traditional_lexer()
-
-
-class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-        self.start = parser_conf.start
-
-        self._prepare_match(lexer_conf)
-        resolve_ambiguity = options.ambiguity == 'resolve'
-        debug = options.debug if options else False
-        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
-        self.parser = xearley.Parser(parser_conf,
-                                     self.match,
-                                     ignore=lexer_conf.ignore,
-                                     resolve_ambiguity=resolve_ambiguity,
-                                     debug=debug,
-                                     tree_class=tree_class,
-                                     **kw
-                                     )
-
-    def match(self, term, text, index=0):
-        return self.regexps[term.name].match(text, index)
-
-    def _prepare_match(self, lexer_conf):
+class EarleyRegexpMatcher:
+    def __init__(self, lexer_conf):
         self.regexps = {}
         for t in lexer_conf.tokens:
             if t.priority != 1:
@@ -230,31 +192,49 @@ class XEarley(_ParserFrontend):
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
 
-    def parse(self, text, start):
-        return self._parse(start, text)
+    def match(self, term, text, index=0):
+        return self.regexps[term.name].match(text, index)
 
 
-class XEarley_CompleteLex(XEarley):
-    def __init__(self, *args, **kw):
-        XEarley.__init__(self, *args, complete_lex=True, **kw)
+def make_xearley(lexer_conf, parser_conf, options=None, **kw):
+    earley_matcher = EarleyRegexpMatcher(lexer_conf)
+    return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw)
 
+def _match_earley_basic(term, token):
+    return term.name == token.type
 
 
-class CYK(WithLexer):
 
+def make_early_basic(lexer_conf, parser_conf, options, **kw):
+    return earley.Parser(parser_conf, _match_earley_basic, **kw)
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
-        self.init_traditional_lexer()
 
+def make_early(lexer_conf, parser_conf, options):
+    resolve_ambiguity = options.ambiguity == 'resolve'
+    debug = options.debug if options else False
+    tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
+
+    extra = {}
+    if lexer_conf.name == 'dynamic':
+        f = make_xearley
+    elif lexer_conf.name == 'dynamic_complete':
+        extra['complete_lex'] =True
+        f = make_xearley
+    else:
+        f = make_early_basic
+    return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)
+
+
+
+class CYK_FrontEnd:
+    def __init__(self, lexer_conf, parser_conf, options=None):
         self._analysis = GrammarAnalyzer(parser_conf)
         self.parser = cyk.Parser(parser_conf.rules)
 
         self.callbacks = parser_conf.callbacks
 
-    def parse(self, text, start):
-        tokens = list(self.make_lexer(text).lex(None))
-        parse = self._parse(start, tokens)
-        parse = self._transform(parse)
-        return parse
+    def parse(self, lexer, start):
+        tokens = list(lexer.lex(None))
+        tree = self.parser.parse(tokens, start)
+        return self._transform(tree)
 
     def _transform(self, tree):
         subtrees = list(tree.iter_subtrees())
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 3d006e7..f7ff8fe 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -5,13 +5,14 @@ from copy import deepcopy, copy
 
 from ..exceptions import UnexpectedInput, UnexpectedToken
 from ..lexer import Token
+from ..utils import Serialize
 
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_puppet import ParserPuppet
 
 ###{standalone
 
-class LALR_Parser(object):
+class LALR_Parser(Serialize):
     def __init__(self, parser_conf, debug=False):
         analysis = LALR_Analyzer(parser_conf, debug=debug)
         analysis.compute_lalr()
diff --git a/lark/utils.py b/lark/utils.py
index 366922b..3b5b8a8 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -302,4 +302,5 @@ def _serialize(value, memo):
         return list(value)  # TODO reversible?
     elif isinstance(value, dict):
         return {key:_serialize(elem, memo) for key, elem in value.items()}
+    # assert value is None or isinstance(value, (int, float, str, tuple)), value
     return value
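Usage sketch (not part of the patch): a rough illustration of how the refactored front-end construction flows, based only on the code added above. The lexer_conf / parser_conf / options objects are assumed to be prepared by Lark itself and are left abstract here.

    # Sketch under the assumptions stated above -- not the library's documented public API.
    from lark.parser_frontends import get_frontend

    # get_frontend() now only validates the (parser, lexer) combination and returns
    # a MakeParsingFrontend factory instead of a dedicated frontend class.
    make_frontend = get_frontend('lalr', 'contextual')

    # Calling the factory stamps the chosen names onto the configs and builds a
    # ParsingFrontend, which dispatches to create_lalr_parser / create_contextual_lexer
    # via its lookup tables.
    # frontend = make_frontend(lexer_conf, parser_conf, options)
    # tree = frontend.parse("input text")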