diff --git a/lark/common.py b/lark/common.py
index 9c18972..e1ec220 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -1,12 +1,23 @@
+from .utils import Serialize
+from .lexer import TerminalDef
+###{standalone
+
+class LexerConf(Serialize):
+    __serialize_fields__ = 'tokens', 'ignore'
+    __serialize_namespace__ = TerminalDef,

-class LexerConf:
     def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
         self.tokens = tokens
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}

+    def _deserialize(self):
+        self.callbacks = {} # TODO
+
+###}
+
 class ParserConf:
     def __init__(self, rules, callbacks, start):
         self.rules = rules
diff --git a/lark/lexer.py b/lark/lexer.py
index 6f94a1e..bdf635d 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -261,7 +261,7 @@ def _regexp_has_newline(r):
     """
     return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)

-class Lexer(Serialize):
+class Lexer(object):
     """Lexer interface

     Method Signatures:
@@ -274,13 +274,6 @@ class Lexer(Serialize):

 class TraditionalLexer(Lexer):

-    __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
-    __serialize_namespace__ = TerminalDef,
-
-    def _deserialize(self):
-        self.user_callbacks = {}  # TODO implement
-        self.build()
-
     def __init__(self, terminals, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

@@ -329,9 +322,6 @@ class TraditionalLexer(Lexer):

 class ContextualLexer(Lexer):

-    __serialize_fields__ = 'root_lexer', 'lexers'
-    __serialize_namespace__ = TraditionalLexer,
-
     def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in terminals:
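Note: the two files above move serialization out of the lexer classes and into LexerConf, so only the declarative fields ('tokens', 'ignore') survive a round-trip and the compiled lexer is rebuilt after loading. A minimal self-contained sketch of that pattern, with all names invented for illustration (this is not lark's actual Serialize API):

    import json

    class LexerConfSketch:
        """Persist configuration only; rebuild derived state on load."""
        def __init__(self, tokens, ignore=()):
            self.tokens = tokens
            self.ignore = ignore
            self.callbacks = {}          # runtime-only, never serialized

        def serialize(self):
            # mirrors __serialize_fields__ = 'tokens', 'ignore'
            return json.dumps({'tokens': self.tokens, 'ignore': list(self.ignore)})

        @classmethod
        def deserialize(cls, data):
            d = json.loads(data)
            inst = cls(d['tokens'], tuple(d['ignore']))
            inst.callbacks = {}          # reset on load, like LexerConf._deserialize
            return inst

    conf = LexerConfSketch(['NUMBER', 'WS'], ignore=('WS',))
    restored = LexerConfSketch.deserialize(conf.serialize())
    assert restored.tokens == ['NUMBER', 'WS'] and restored.ignore == ('WS',)

Persisting the config instead of the built lexer keeps the serialized format small and avoids storing compiled regexps and callbacks, at the cost of re-running lexer construction on load.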
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 73c4611..f81001c 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -8,6 +8,7 @@ from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
+from .common import LexerConf

 ###{standalone

@@ -50,34 +51,24 @@ class WithLexer(Serialize):
     parser = None
     lexer_conf = None

-    __serialize_fields__ = 'parser', 'lexer'
-    __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer
+    __serialize_fields__ = 'parser', 'lexer_conf'
+    __serialize_namespace__ = LexerConf,
+
+    def __init__(self, lexer_conf, parser_conf, options=None):
+        self.lexer_conf = lexer_conf
+        self.postlex = lexer_conf.postlex

     @classmethod
     def deserialize(cls, data, memo, callbacks, postlex):
         inst = super(WithLexer, cls).deserialize(data, memo)
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        inst.init_lexer()
         return inst

     def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize(memo)

-    def init_traditional_lexer(self, lexer_conf):
-        self.lexer_conf = lexer_conf
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
-        self.postlex = lexer_conf.postlex
-
-    def init_contextual_lexer(self, lexer_conf):
-        self.lexer_conf = lexer_conf
-        self.postlex = lexer_conf.postlex
-        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
-        always_accept = self.postlex.always_accept if self.postlex else ()
-        self.lexer = ContextualLexer(lexer_conf.tokens, states,
-                                     ignore=lexer_conf.ignore,
-                                     always_accept=always_accept,
-                                     user_callbacks=lexer_conf.callbacks)
-
     def lex(self, text):
         stream = self.lexer.lex(text)
         return self.postlex.process(stream) if self.postlex else stream
@@ -87,26 +78,41 @@ class WithLexer(Serialize):
         sps = self.lexer.set_parser_state
         return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])

+    def init_traditional_lexer(self):
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)

-class LALR_TraditionalLexer(WithLexer):
+class LALR_WithLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        self.init_traditional_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)

-class LALR_ContextualLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        debug = options.debug if options else False
-        self.parser = LALR_Parser(parser_conf, debug=debug)
-        self.init_contextual_lexer(lexer_conf)
+        self.init_lexer()
+
+    def init_lexer(self):
+        raise NotImplementedError()
+
+class LALR_TraditionalLexer(LALR_WithLexer):
+    def init_lexer(self):
+        self.init_traditional_lexer()

+class LALR_ContextualLexer(LALR_WithLexer):
+    def init_lexer(self):
+        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
+        always_accept = self.postlex.always_accept if self.postlex else ()
+        self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     ignore=self.lexer_conf.ignore,
+                                     always_accept=always_accept,
+                                     user_callbacks=self.lexer_conf.callbacks)
 ###}

-class LALR_CustomLexer(WithLexer):
+class LALR_CustomLexer(LALR_WithLexer):
     def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.parser = LALR_Parser(parser_conf)
-        self.lexer_conf = lexer_conf
-        self.lexer = lexer_cls(lexer_conf)
+        self.lexer_cls = lexer_cls
+        LALR_WithLexer.__init__(self, lexer_conf, parser_conf, options)
+
+    def init_lexer(self):
+        self.lexer = self.lexer_cls(self.lexer_conf)

 def tokenize_text(text):
     line = 1
@@ -119,7 +124,8 @@ def tokenize_text(text):

 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
-        self.init_traditional_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        self.init_traditional_lexer()

         resolve_ambiguity = options.ambiguity == 'resolve'
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
@@ -172,7 +178,8 @@ class XEarley_CompleteLex(XEarley):

 class CYK(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
-        self.init_traditional_lexer(lexer_conf)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        self.init_traditional_lexer()

         self._analysis = GrammarAnalyzer(parser_conf)
         self._parser = cyk.Parser(parser_conf.rules, parser_conf.start)
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 5510e3d..aea75ca 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -25,7 +25,8 @@ class LALR_Parser(object):
     @classmethod
     def deserialize(cls, data, memo, callbacks):
         inst = cls.__new__(cls)
-        inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks)
+        inst._parse_table = IntParseTable.deserialize(data, memo)
+        inst.parser = _Parser(inst._parse_table, callbacks)
         return inst

     def serialize(self, memo):
diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py
index ab334b5..07016ff 100644
--- a/lark/tools/standalone.py
+++ b/lark/tools/standalone.py
@@ -65,6 +65,7 @@ EXTRACT_STANDALONE_FILES = [
     'indenter.py',
     'grammar.py',
     'lexer.py',
+    'common.py',
     'parse_tree_builder.py',
     'parsers/lalr_parser.py',
     'parsers/lalr_analysis.py',
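Note: LALR_Parser.deserialize now keeps _parse_table on the instance so that, after WithLexer.deserialize restores lexer_conf and the parser, the final init_lexer() call can rebuild even the contextual lexer, whose per-state terminal sets come from that table. A toy sketch of the restore order this enforces (hypothetical names, not lark's API):

    class FrontendSketch:
        """Restore config and parse table first, rebuild the lexer last."""
        @classmethod
        def deserialize(cls, data, postlex):
            inst = cls.__new__(cls)
            inst.lexer_conf = data['lexer_conf']     # 1. restore the config
            inst.postlex = postlex                   # 2. re-attach the postlexer
            inst.parse_table = data['parse_table']   # 3. restore the parse table
            inst.init_lexer()                        # 4. only now build the lexer
            return inst

        def init_lexer(self):
            # a contextual lexer would read its states from self.parse_table here
            self.lexer = ('lexer-for', self.lexer_conf)

    f = FrontendSketch.deserialize({'lexer_conf': 'c', 'parse_table': {}}, None)
    assert f.lexer == ('lexer-for', 'c')

The ordering matters: init_lexer() must run after the parser is restored, since LALR_ContextualLexer.init_lexer reads self.parser._parse_table.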
diff --git a/lark/utils.py b/lark/utils.py
index b1d9671..d46beec 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -103,7 +103,10 @@ class Serialize(object):
         inst = cls.__new__(cls)
         for f in fields:
-            setattr(inst, f, _deserialize(data[f], namespace, memo))
+            try:
+                setattr(inst, f, _deserialize(data[f], namespace, memo))
+            except KeyError as e:
+                raise KeyError("Cannot find key for class", cls, e)
         postprocess = getattr(inst, '_deserialize', None)
         if postprocess:
             postprocess()
@@ -164,6 +167,15 @@ def smart_decorator(f, create_decorator):
 import sys, re
 Py36 = (sys.version_info[:2] >= (3, 6))

+
+import sre_parse
+import sre_constants
+def get_regexp_width(regexp):
+    try:
+        return sre_parse.parse(regexp).getwidth()
+    except sre_constants.error:
+        raise ValueError(regexp)
+
 ###}

@@ -209,14 +221,6 @@ except NameError:
         return -1

-import sre_parse
-import sre_constants
-def get_regexp_width(regexp):
-    try:
-        return sre_parse.parse(regexp).getwidth()
-    except sre_constants.error:
-        raise ValueError(regexp)
-
 class Enumerator(Serialize):

     def __init__(self):
diff --git a/tests/test_tools.py b/tests/test_tools.py
index e1c49c4..ff823ec 100644
--- a/tests/test_tools.py
+++ b/tests/test_tools.py
@@ -74,6 +74,28 @@ class TestStandalone(TestCase):
         x = l2.parse('ABAB')
         self.assertEqual(x, ['a', 'b'])

+    def test_postlex(self):
+        from lark.indenter import Indenter
+        class MyIndenter(Indenter):
+            NL_type = '_NEWLINE'
+            OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
+            CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
+            INDENT_type = '_INDENT'
+            DEDENT_type = '_DEDENT'
+            tab_len = 8
+
+        grammar = r"""
+        start: "(" ")" _NEWLINE
+        _NEWLINE: /\n/
+        """
+
+        context = self._create_standalone(grammar)
+        _Lark = context['Lark_StandAlone']
+
+        l = _Lark(postlex=MyIndenter())
+        x = l.parse('(\n)\n')
+        self.assertEqual(x.data, 'start')
+

 if __name__ == '__main__':
     unittest.main()
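Note: with common.py added to the standalone bundle and the lexer rebuilt from lexer_conf on load, a generated standalone parser can now accept a postlex argument; that is what test_postlex exercises. Usage against a generated module would look roughly like this ('my_parser' is a placeholder for a module emitted by lark.tools.standalone, not a real module name):

    from my_parser import Lark_StandAlone   # hypothetical generated module
    from lark.indenter import Indenter      # indenter.py is also in the bundle

    class MyIndenter(Indenter):
        NL_type = '_NEWLINE'
        OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
        CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
        INDENT_type = '_INDENT'
        DEDENT_type = '_DEDENT'
        tab_len = 8

    parser = Lark_StandAlone(postlex=MyIndenter())
    tree = parser.parse('(\n)\n')   # the indenter drops the newline inside parens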