From d13ebb9c15a16ac31449827bcf868a61bd4fcc1e Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Mon, 8 Apr 2019 20:29:01 +0300
Subject: [PATCH] Using a mostly-generic serialization method

---
 lark/grammar.py               | 52 +++++++---------------
 lark/lark.py                  | 17 +++----
 lark/lexer.py                 | 83 +++++++++--------------------------
 lark/parser_frontends.py      | 38 +++++++---------
 lark/parsers/lalr_analysis.py | 32 +++++++++++++-
 lark/parsers/lalr_parser.py   | 38 +++-------------
 lark/utils.py                 | 48 ++++++++++++++++++++
 7 files changed, 145 insertions(+), 163 deletions(-)

diff --git a/lark/grammar.py b/lark/grammar.py
index 0dc1c21..adde150 100644
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -1,4 +1,6 @@
-class Symbol(object):
+from .utils import Serialize
+
+class Symbol(Serialize):
     is_term = NotImplemented
 
     def __init__(self, name):
@@ -19,16 +21,10 @@ class Symbol(object):
 
     fullrepr = property(__repr__)
 
-    @classmethod
-    def deserialize(cls, data):
-        class_ = {
-            'T': Terminal,
-            'NT': NonTerminal,
-        }[data[0]]
-        return class_(*data[1:])
-
 
 class Terminal(Symbol):
+    __serialize_fields__ = 'name', 'filter_out'
+
     is_term = True
 
     def __init__(self, name, filter_out=False):
@@ -39,23 +35,25 @@ class Terminal(Symbol):
     def fullrepr(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out)
 
-    def serialize(self):
-        return ['T', self.name, self.filter_out]
 
 class NonTerminal(Symbol):
+    __serialize_fields__ = 'name',
+
     is_term = False
 
-    def serialize(self):
-        return ['NT', self.name]
 
-class Rule(object):
+class Rule(Serialize):
     """
         origin : a symbol
         expansion : a list of symbols
         order : index of this expansion amongst all rules of the same name
     """
     __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')
+
+    __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
+    __serialize_namespace__ = lambda: (Terminal, NonTerminal, RuleOptions)
+
     def __init__(self, origin, expansion, order=0, alias=None, options=None):
         self.origin = origin
         self.expansion = expansion
@@ -64,6 +62,8 @@ class Rule(object):
         self.options = options
         self._hash = hash((self.origin, tuple(self.expansion)))
 
+    def _deserialize(self):
+        self._hash = hash((self.origin, tuple(self.expansion)))
 
     def __str__(self):
         return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion))
@@ -79,22 +79,11 @@ class Rule(object):
             return False
         return self.origin == other.origin and self.expansion == other.expansion
 
-    def serialize(self):
-        return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.order, self.alias, self.options.serialize() if self.options else None]
-
-    @classmethod
-    def deserialize(cls, data):
-        origin, expansion, order, alias, options = data
-        return cls(
-            Symbol.deserialize(origin),
-            [Symbol.deserialize(s) for s in expansion],
-            order,
-            alias,
-            RuleOptions.deserialize(options) if options else None
-        )
 
-class RuleOptions:
+class RuleOptions(Serialize):
+    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'
+
     def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
@@ -107,10 +96,3 @@ class RuleOptions:
             self.expand1,
             self.priority,
         )
-
-    def serialize(self):
-        return [self.keep_all_tokens, self.expand1, self.priority, list(self.empty_indices)]
-
-    @classmethod
-    def deserialize(cls, data):
-        return cls(*data)
diff --git a/lark/lark.py b/lark/lark.py
index abac7fc..3d81b21 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -5,7 +5,7 @@ import time
 from collections import defaultdict
 from io import open
 
-from .utils import STRING_TYPE
+from .utils import STRING_TYPE, Serialize
 from .load_grammar import load_grammar
 from .tree import Tree
 from .common import LexerConf, ParserConf
@@ -13,9 +13,10 @@ from .common import LexerConf, ParserConf
 from .lexer import Lexer, TraditionalLexer
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
+from .grammar import Rule
 
 
-class LarkOptions(object):
+class LarkOptions(Serialize):
     """Specifies the options for Lark
 
     """
@@ -132,7 +133,7 @@ class Profiler:
         return wrapper
 
 
-class Lark:
+class Lark(Serialize):
     def __init__(self, grammar, **options):
         """
             grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
@@ -223,6 +224,8 @@ class Lark:
     if __init__.__doc__:
         __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
 
+    __serialize_fields__ = 'parser', 'rules', 'options'
+
     def _build_lexer(self):
         return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
 
@@ -236,16 +239,8 @@ class Lark:
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
         return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
 
-    def serialize(self):
-        return {
-            'parser': self.parser.serialize(),
-            'rules': [r.serialize() for r in self.rules],
-            'options': self.options.serialize(),
-        }
-
     @classmethod
     def deserialize(cls, data):
-        from .grammar import Rule
         inst = cls.__new__(cls)
         inst.options = LarkOptions.deserialize(data['options'])
         inst.rules = [Rule.deserialize(r) for r in data['rules']]
diff --git a/lark/lexer.py b/lark/lexer.py
index 29ba1f6..ae370dc 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -2,10 +2,12 @@
 import re
 
-from .utils import Str, classify, get_regexp_width, Py36
+from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError
 
 
-class Pattern(object):
+class Pattern(Serialize):
+    __serialize_fields__ = 'value', 'flags'
+
     def __init__(self, value, flags=()):
         self.value = value
         self.flags = frozenset(flags)
@@ -35,15 +37,6 @@ class Pattern(object):
                 value = ('(?%s)' % f) + value
         return value
 
-    @classmethod
-    def deserialize(cls, data):
-        class_ = {
-            's': PatternStr,
-            're': PatternRE,
-        }[data[0]]
-        value, flags = data[1:]
-        return class_(value, frozenset(flags))
-
 
 class PatternStr(Pattern):
     def to_regexp(self):
@@ -54,9 +47,6 @@ class PatternStr(Pattern):
         return len(self.value)
     max_width = min_width
 
-    def serialize(self):
-        return ['s', self.value, list(self.flags)]
-
 class PatternRE(Pattern):
     def to_regexp(self):
         return self._get_flags(self.value)
@@ -68,10 +58,11 @@ class PatternRE(Pattern):
     def max_width(self):
         return get_regexp_width(self.to_regexp())[1]
 
-    def serialize(self):
-        return ['re', self.value, list(self.flags)]
 
-class TerminalDef(object):
+class TerminalDef(Serialize):
+    __serialize_fields__ = 'name', 'pattern', 'priority'
+    __serialize_namespace__ = lambda: (PatternStr, PatternRE)
+
     def __init__(self, name, pattern, priority=1):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
@@ -81,14 +72,6 @@ class TerminalDef(object):
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
 
-    def serialize(self):
-        return [self.name, self.pattern.serialize(), self.priority]
-
-    @classmethod
-    def deserialize(cls, data):
-        name, pattern, priority = data
-        return cls(name, Pattern.deserialize(pattern), priority)
-
 
 ###{standalone
 
@@ -278,7 +261,7 @@ def _regexp_has_newline(r):
     """
     return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
 
-class Lexer:
+class Lexer(Serialize):
    """Lexer interface

    Method Signatures:
@@ -289,15 +272,16 @@ class Lexer:
     set_parser_state = NotImplemented
     lex = NotImplemented
 
-    @classmethod
-    def deserialize(cls, data):
-        class_ = {
-            'traditional': TraditionalLexer,
-            'contextual': ContextualLexer,
-        }[data['type']]
-        return class_.deserialize(data)
 
 class TraditionalLexer(Lexer):
+    __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
+    __serialize_namespace__ = lambda: (TerminalDef,)
+
+    def _deserialize(self):
+        self.mres = build_mres(self.terminals)
+        self.callback = {} # TODO implement
+
+
     def __init__(self, terminals, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
 
@@ -339,26 +323,13 @@ class TraditionalLexer(Lexer):
     def lex(self, stream):
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
 
-    def serialize(self):
-        return {
-            'type': 'traditional',
-            'terminals': [t.serialize() for t in self.terminals],
-            'ignore_types': self.ignore_types,
-            'newline_types': self.newline_types,
-        }
-
-    @classmethod
-    def deserialize(cls, data):
-        inst = cls.__new__(cls)
-        inst.terminals = [TerminalDef.deserialize(t) for t in data['terminals']]
-        inst.mres = build_mres(inst.terminals)
-        inst.ignore_types = data['ignore_types']
-        inst.newline_types = data['newline_types']
-        inst.callback = {} # TODO implement
-        return inst
 
 
 class ContextualLexer(Lexer):
+    __serialize_fields__ = 'root_lexer', 'lexers'
+    __serialize_namespace__ = lambda: (TraditionalLexer,)
+
     def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in terminals:
@@ -392,17 +363,3 @@ class ContextualLexer(Lexer):
                 yield x
             l.lexer = self.lexers[self.parser_state]
             l.state = self.parser_state
-
-    def serialize(self):
-        return {
-            'type': 'contextual',
-            'root_lexer': self.root_lexer.serialize(),
-            'lexers': {state: lexer.serialize() for state, lexer in self.lexers.items()}
-        }
-
-    @classmethod
-    def deserialize(cls, data):
-        inst = cls.__new__(cls)
-        inst.lexers = {state:Lexer.deserialize(lexer) for state, lexer in data['lexers'].items()}
-        inst.root_lexer = TraditionalLexer.deserialize(data['root_lexer'])
-        return inst
\ No newline at end of file
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 8ef3c8e..7c47173 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,17 +1,31 @@
 import re
 from functools import partial
 
-from .utils import get_regexp_width
+from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
 from .parsers import lalr_parser, earley, xearley, cyk
+from .grammar import Rule
 from .tree import Tree
 
-class WithLexer(object):
+class WithLexer(Serialize):
     lexer = None
     parser = None
     lexer_conf = None
 
+    __serialize_fields__ = 'parser', 'lexer'
+    __serialize_namespace__ = lambda: (Rule, ContextualLexer, LALR_ContextualLexer)
+
+    @classmethod
+    def deserialize(cls, data, callbacks):
+        inst = super(WithLexer, cls).deserialize(data)
+        inst.postlex = None # TODO
+        inst.parser = lalr_parser.Parser.deserialize(inst.parser, callbacks)
+        return inst
+
+    def _serialize(self, data):
+        data['parser'] = data['parser'].serialize()
+
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
         self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
@@ -36,26 +50,6 @@ class WithLexer(object):
         sps = self.lexer.set_parser_state
         return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else [])
 
-    def serialize(self):
-        return {
-            'type': type(self).__name__,
-            'parser': self.parser.serialize(),
-            'lexer': self.lexer.serialize(),
-        }
-
-    @classmethod
-    def deserialize(cls, data, callbacks):
-        class_ = {
-            'LALR_TraditionalLexer': LALR_TraditionalLexer,
-            'LALR_ContextualLexer': LALR_ContextualLexer,
-        }[data['type']]
-        parser = lalr_parser.Parser.deserialize(data['parser'], callbacks)
-        assert parser
-        inst = class_.__new__(class_)
-        inst.parser = parser
-        inst.lexer = Lexer.deserialize(data['lexer'])
-        inst.postlex = None # TODO
-        return inst
-
 
 class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index 950fb27..3dced8e 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -9,7 +9,7 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
 import logging
 from collections import defaultdict
 
-from ..utils import classify, classify_bool, bfs, fzset
+from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError
 
 from .grammar_analysis import GrammarAnalyzer, Terminal
@@ -31,6 +31,36 @@ class ParseTable:
         self.start_state = start_state
         self.end_state = end_state
 
+    def serialize(self):
+        tokens = Enumerator()
+        rules = Enumerator()
+
+        states = {
+            state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg))
+                    for token, (action, arg) in actions.items()}
+            for state, actions in self.states.items()
+        }
+
+        return {
+            'tokens': tokens.reversed(),
+            'rules': {idx: r.serialize() for idx, r in rules.reversed().items()},
+            'states': states,
+            'start_state': self.start_state,
+            'end_state': self.end_state,
+        }
+
+    @classmethod
+    def deserialize(cls, data):
+        tokens = data['tokens']
+        rules = data['rules']
+        states = {
+            state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg))
+                    for token, (action, arg) in actions.items()}
+            for state, actions in data['states'].items()
+        }
+        return cls(states, data['start_state'], data['end_state'])
+
+
 class IntParseTable(ParseTable):
 
     @classmethod
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index a7eebf6..701d158 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -5,12 +5,12 @@ from ..exceptions import UnexpectedToken
 from ..lexer import Token
 from ..grammar import Rule
 
-from ..utils import Enumerator
+from ..utils import Enumerator, Serialize
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 
 
-class Parser(object):
+class Parser:
     def __init__(self, parser_conf, debug=False):
         assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization"
 
@@ -21,42 +21,18 @@ class Parser(object):
         self._parse_table = analysis.parse_table
         self.parser_conf = parser_conf
         self.parser = _Parser(analysis.parse_table, callbacks)
-        self.parse = self.parser.parse
 
-    def serialize(self):
-        tokens = Enumerator()
-        rules = Enumerator()
-
-        states = {
-            state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg))
-                    for token, (action, arg) in actions.items()}
-            for state, actions in self._parse_table.states.items()
-        }
-
-        return {
-            'tokens': tokens.reversed(),
-            'rules': {idx: r.serialize() for idx, r in rules.reversed().items()},
-            'states': states,
-            'start_state': self._parse_table.start_state,
-            'end_state': self._parse_table.end_state,
-        }
-
     @classmethod
     def deserialize(cls, data, callbacks):
-        tokens = data['tokens']
-        rules = {idx: Rule.deserialize(r) for idx, r in data['rules'].items()}
-        states = {
-            state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg))
-                    for token, (action, arg) in actions.items()}
-            for state, actions in data['states'].items()
-        }
-        parse_table = IntParseTable(states, data['start_state'], data['end_state'])
         inst = cls.__new__(cls)
-        inst.parser = _Parser(parse_table, callbacks)
-        inst.parse = inst.parser.parse
+        inst.parser = _Parser(IntParseTable.deserialize(data), callbacks)
         return inst
 
+    def serialize(self):
+        return self._parse_table.serialize()
+    def parse(self, *args):
+        return self.parser.parse(*args)
 
 ###{standalone
diff --git a/lark/utils.py b/lark/utils.py
index d65cac2..3dda697 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -44,6 +44,54 @@ def bfs(initial, expand):
 
 
 
+def _serialize(value):
+    if isinstance(value, Serialize):
+        return value.serialize()
+    elif isinstance(value, list):
+        return [_serialize(elem) for elem in value]
+    elif isinstance(value, frozenset):
+        return list(value) # TODO reversible?
+    elif isinstance(value, dict):
+        return {key:_serialize(elem) for key, elem in value.items()}
+    return value
+
+def _deserialize(data, namespace):
+    if isinstance(data, dict):
+        if '__type__' in data: # Object
+            class_ = namespace[data['__type__']]
+            return class_.deserialize(data)
+        return {key:_deserialize(value, namespace) for key, value in data.items()}
+    elif isinstance(data, list):
+        return [_deserialize(value, namespace) for value in data]
+    return data
+
+
+class Serialize(object):
+    def serialize(self):
+        fields = getattr(self, '__serialize_fields__')
+        res = {f: _serialize(getattr(self, f)) for f in fields}
+        res['__type__'] = type(self).__name__
+        postprocess = getattr(self, '_serialize', None)
+        if postprocess:
+            postprocess(res)
+        return res
+
+    @classmethod
+    def deserialize(cls, data):
+        namespace = getattr(cls, '__serialize_namespace__', dict)
+        namespace = {c.__name__:c for c in namespace()}
+
+        fields = getattr(cls, '__serialize_fields__')
+
+        inst = cls.__new__(cls)
+        for f in fields:
+            setattr(inst, f, _deserialize(data[f], namespace))
+        postprocess = getattr(inst, '_deserialize', None)
+        if postprocess:
+            postprocess()
+        return inst
+
+
 ###{standalone
 try:
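
# ---------------------------------------------------------------------------
# Illustration (not part of the patch): a minimal sketch of how the Serialize
# mixin added in lark/utils.py above is meant to be used.  The Point class and
# its field names below are hypothetical examples, not code from lark, and this
# pared-down copy of the mixin skips the recursive _serialize/_deserialize
# helpers and the __serialize_namespace__ lookup; it only round-trips plain
# attribute values.

class Serialize(object):
    def serialize(self):
        fields = getattr(self, '__serialize_fields__')
        res = {f: getattr(self, f) for f in fields}   # the real mixin recurses via _serialize()
        res['__type__'] = type(self).__name__         # tag the dict so deserialize can pick the class
        return res

    @classmethod
    def deserialize(cls, data):
        inst = cls.__new__(cls)                       # bypass __init__, restore attributes directly
        for f in getattr(cls, '__serialize_fields__'):
            setattr(inst, f, data[f])                 # the real mixin recurses via _deserialize()
        return inst


class Point(Serialize):
    __serialize_fields__ = 'x', 'y'                   # attributes to dump and restore

    def __init__(self, x, y):
        self.x = x
        self.y = y


p = Point(3, 4)
data = p.serialize()            # {'x': 3, 'y': 4, '__type__': 'Point'}
q = Point.deserialize(data)     # a new Point carrying the same attribute values
assert (q.x, q.y) == (p.x, p.y)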