diff --git a/lark/grammar.py b/lark/grammar.py
index adde150..14893fb 100644
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -1,5 +1,7 @@
 from .utils import Serialize
 
+###{standalone
+
 class Symbol(Serialize):
     is_term = NotImplemented
 
@@ -43,6 +45,24 @@ class NonTerminal(Symbol):
     is_term = False
 
 
+class RuleOptions(Serialize):
+    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'
+
+    def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
+        self.keep_all_tokens = keep_all_tokens
+        self.expand1 = expand1
+        self.priority = priority
+        self.empty_indices = empty_indices
+
+    def __repr__(self):
+        return 'RuleOptions(%r, %r, %r)' % (
+            self.keep_all_tokens,
+            self.expand1,
+            self.priority,
+        )
+
+
 class Rule(Serialize):
     """
         origin : a symbol
@@ -52,7 +72,7 @@ class Rule(Serialize):
     __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')
 
     __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
-    __serialize_namespace__ = lambda: (Terminal, NonTerminal, RuleOptions)
+    __serialize_namespace__ = Terminal, NonTerminal, RuleOptions
 
     def __init__(self, origin, expansion, order=0, alias=None, options=None):
         self.origin = origin
@@ -81,18 +101,4 @@ class Rule(Serialize):
 
 
 
-class RuleOptions(Serialize):
-    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'
-
-    def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
-        self.keep_all_tokens = keep_all_tokens
-        self.expand1 = expand1
-        self.priority = priority
-        self.empty_indices = empty_indices
-
-    def __repr__(self):
-        return 'RuleOptions(%r, %r, %r)' % (
-            self.keep_all_tokens,
-            self.expand1,
-            self.priority,
-        )
+###}
\ No newline at end of file
diff --git a/lark/lark.py b/lark/lark.py
index 3d81b21..1309c60 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -15,6 +15,7 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 from .grammar import Rule
 
+###{standalone
 
 class LarkOptions(Serialize):
     """Specifies the options for Lark
@@ -101,11 +102,11 @@ class LarkOptions(Serialize):
         assert name in self.options
         self.options[name] = value
 
-    def serialize(self):
+    def serialize(self, memo):
         return self.options
 
     @classmethod
-    def deserialize(cls, data):
+    def deserialize(cls, data, memo):
         return cls(data)
 
 
@@ -240,12 +241,12 @@ class Lark(Serialize):
         return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
 
     @classmethod
-    def deserialize(cls, data):
+    def deserialize(cls, data, memo):
         inst = cls.__new__(cls)
-        inst.options = LarkOptions.deserialize(data['options'])
-        inst.rules = [Rule.deserialize(r) for r in data['rules']]
+        inst.options = LarkOptions.deserialize(data['options'], memo)
+        inst.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         inst._prepare_callbacks()
-        inst.parser = inst.parser_class.deserialize(data['parser'], inst._callbacks)
+        inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks)
         return inst
 
 
@@ -284,14 +285,4 @@ class Lark(Serialize):
         "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise."
         return self.parser.parse(text)
 
-        # if self.profiler:
-        #     self.profiler.enter_section('lex')
-        #     l = list(self.lex(text))
-        #     self.profiler.enter_section('parse')
-        #     try:
-        #         return self.parser.parse(l)
-        #     finally:
-        #         self.profiler.enter_section('outside_lark')
-        # else:
-        #     l = list(self.lex(text))
-        #     return self.parser.parse(l)
+###}
\ No newline at end of file
diff --git a/lark/lexer.py b/lark/lexer.py
index ae370dc..080770d 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -5,6 +5,8 @@ import re
 
 from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError
 
+###{standalone
+
 class Pattern(Serialize):
     __serialize_fields__ = 'value', 'flags'
@@ -61,7 +63,7 @@ class PatternRE(Pattern):
 
 class TerminalDef(Serialize):
     __serialize_fields__ = 'name', 'pattern', 'priority'
-    __serialize_namespace__ = lambda: (PatternStr, PatternRE)
+    __serialize_namespace__ = PatternStr, PatternRE
 
     def __init__(self, name, pattern, priority=1):
         assert isinstance(pattern, Pattern), pattern
@@ -74,7 +76,6 @@ class TerminalDef(Serialize):
 
 
 
-###{standalone
 class Token(Str):
     __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
 
@@ -205,7 +206,6 @@ class CallChain:
         return self.callback2(t) if self.cond(t2) else t2
 
 
-###}
 
 
 
@@ -275,7 +275,7 @@ class Lexer(Serialize):
 
 class TraditionalLexer(Lexer):
     __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types'
-    __serialize_namespace__ = lambda: (TerminalDef,)
+    __serialize_namespace__ = TerminalDef,
 
     def _deserialize(self):
         self.mres = build_mres(self.terminals)
@@ -328,7 +328,7 @@ class TraditionalLexer(Lexer):
 
 class ContextualLexer(Lexer):
     __serialize_fields__ = 'root_lexer', 'lexers'
-    __serialize_namespace__ = lambda: (TraditionalLexer,)
+    __serialize_namespace__ = TraditionalLexer,
 
     def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
@@ -363,3 +363,5 @@ class ContextualLexer(Lexer):
             yield x
             l.lexer = self.lexers[self.parser_state]
             l.state = self.parser_state
+
+###}
\ No newline at end of file
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 7c47173..6750480 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -4,26 +4,29 @@ from functools import partial
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
-from .parsers import lalr_parser, earley, xearley, cyk
+from .parsers import earley, xearley, cyk
+from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
 
+###{standalone
+
 class WithLexer(Serialize):
     lexer = None
     parser = None
     lexer_conf = None
 
     __serialize_fields__ = 'parser', 'lexer'
-    __serialize_namespace__ = lambda: (Rule, ContextualLexer, LALR_ContextualLexer)
+    __serialize_namespace__ = Rule, ContextualLexer
 
     @classmethod
-    def deserialize(cls, data, callbacks):
-        inst = super(WithLexer, cls).deserialize(data)
+    def deserialize(cls, data, memo, callbacks):
+        inst = super(WithLexer, cls).deserialize(data, memo)
         inst.postlex = None # TODO
-        inst.parser = lalr_parser.Parser.deserialize(inst.parser, callbacks)
+        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
         return inst
 
-    def _serialize(self, data):
+    def _serialize(self, data, memo):
         data['parser'] = data['parser'].serialize()
 
     def init_traditional_lexer(self, lexer_conf):
@@ -54,18 +57,18 @@ class WithLexer(Serialize):
 
 class LALR_TraditionalLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
-        self.parser = lalr_parser.Parser(parser_conf, debug=debug)
+        self.parser = LALR_Parser(parser_conf, debug=debug)
         self.init_traditional_lexer(lexer_conf)
 
 class LALR_ContextualLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
-        self.parser = lalr_parser.Parser(parser_conf, debug=debug)
+        self.parser = LALR_Parser(parser_conf, debug=debug)
         self.init_contextual_lexer(lexer_conf)
 
 class LALR_CustomLexer(WithLexer):
     def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.parser = lalr_parser.Parser(parser_conf)
+        self.parser = LALR_Parser(parser_conf)
         self.lexer_conf = lexer_conf
         self.lexer = lexer_cls(lexer_conf)
@@ -190,3 +193,5 @@ def get_frontend(parser, lexer):
 
 
 
+
+###}
\ No newline at end of file
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index 3dced8e..e34b8c3 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -14,6 +14,8 @@ from ..exceptions import GrammarError
 
 from .grammar_analysis import GrammarAnalyzer, Terminal
 
+###{standalone
+
 class Action:
     def __init__(self, name):
         self.name = name
@@ -50,7 +52,7 @@ class ParseTable:
         }
 
     @classmethod
-    def deserialize(cls, data):
+    def deserialize(cls, data, memo):
         tokens = data['tokens']
         rules = data['rules']
         states = {
@@ -79,8 +81,7 @@ class IntParseTable(ParseTable):
         end_state = state_to_idx[parse_table.end_state]
         return cls(int_states, start_state, end_state)
 
-
-
+###}
 
 class LALR_Analyzer(GrammarAnalyzer):
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 987ef34..241a47e 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -9,7 +9,8 @@ from ..utils import Enumerator, Serialize
 from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable
 
 
-class Parser:
+###{standalone
+class LALR_Parser(object):
     def __init__(self, parser_conf, debug=False):
         assert all(r.options is None or r.options.priority is None
                    for r in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -22,9 +23,9 @@ class Parser:
         self.parser = _Parser(analysis.parse_table, callbacks)
 
     @classmethod
-    def deserialize(cls, data, callbacks):
+    def deserialize(cls, data, memo, callbacks):
         inst = cls.__new__(cls)
-        inst.parser = _Parser(IntParseTable.deserialize(data), callbacks)
+        inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks)
         return inst
 
     def serialize(self):
@@ -33,7 +34,6 @@ class Parser:
     def parse(self, *args):
         return self.parser.parse(*args)
 
-###{standalone
 
 class _Parser:
     def __init__(self, parse_table, callbacks):
diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py
index 3e6cc5d..99e1929 100644
--- a/lark/tools/standalone.py
+++ b/lark/tools/standalone.py
@@ -36,6 +36,7 @@
 #
 ###}
 
+import pprint
 import codecs
 import sys
 import os
@@ -47,6 +48,10 @@ import lark
 from lark import Lark
 from lark.parsers.lalr_analysis import Reduce
 
+
+from lark.grammar import RuleOptions
+from lark.lexer import TerminalDef
+
 _dir = path.dirname(__file__)
 _larkdir = path.join(_dir, path.pardir)
 
@@ -61,9 +66,12 @@ EXTRACT_STANDALONE_FILES = [
     'lexer.py',
     'parse_tree_builder.py',
     'parsers/lalr_parser.py',
+    'parsers/lalr_analysis.py',
+    'parser_frontends.py',
+    'lark.py',
+    'grammar.py',
 ]
 
-
 def extract_sections(lines):
     section = None
     text = []
@@ -83,152 +91,34 @@ def extract_sections(lines):
     return {name:''.join(text) for name, text in sections.items()}
 
 
-def _prepare_mres(mres):
-    return [(p.pattern,{i: t for i, t in d.items()}) for p,d in mres]
-
-class TraditionalLexerAtoms:
-    def __init__(self, lexer):
-        self.mres = _prepare_mres(lexer.mres)
-        self.newline_types = lexer.newline_types
-        self.ignore_types = lexer.ignore_types
-        self.callback = {name:_prepare_mres(c.mres)
-                         for name, c in lexer.callback.items()}
-
-    def print_python(self):
-        print('import re')
-        print('class LexerRegexps: pass')
-        print('NEWLINE_TYPES = %s' % self.newline_types)
-        print('IGNORE_TYPES = %s' % self.ignore_types)
-        self._print_python('lexer')
-
-    def _print_python(self, var_name):
-        print('MRES = (')
-        pprint(self.mres)
-        print(')')
-        print('LEXER_CALLBACK = (')
-        pprint(self.callback)
-        print(')')
-        print('lexer_regexps = LexerRegexps()')
-        print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]')
-        print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])')
-        print('                          for n, mres in LEXER_CALLBACK.items()}')
-        print('%s = (lexer_regexps)' % var_name)
-
-
-class ContextualLexerAtoms:
-    def __init__(self, lexer):
-        self.lexer_atoms = {state: TraditionalLexerAtoms(lexer) for state, lexer in lexer.lexers.items()}
-        self.root_lexer_atoms = TraditionalLexerAtoms(lexer.root_lexer)
-
-    def print_python(self):
-        print('import re')
-        print('class LexerRegexps: pass')
-        print('NEWLINE_TYPES = %s' % self.root_lexer_atoms.newline_types)
-        print('IGNORE_TYPES = %s' % self.root_lexer_atoms.ignore_types)
-
-        print('LEXERS = {}')
-        for state, lexer_atoms in self.lexer_atoms.items():
-            lexer_atoms._print_python('LEXERS[%d]' % state)
-
-        print('class ContextualLexer:')
-        print('    def __init__(self):')
-        print('        self.lexers = LEXERS')
-        print('        self.set_parser_state(None)')
-        print('    def set_parser_state(self, state):')
-        print('        self.parser_state = state')
-        print('    def lex(self, stream):')
-        print('        newline_types = NEWLINE_TYPES')
-        print('        ignore_types = IGNORE_TYPES')
-        print('        lexers = LEXERS')
-        print('        l = _Lex(lexers[self.parser_state], self.parser_state)')
-        print('        for x in l.lex(stream, newline_types, ignore_types):')
-        print('            yield x')
-        print('        l.lexer = lexers[self.parser_state]')
-        print('        l.state = self.parser_state')
-
-        print('CON_LEXER = ContextualLexer()')
-        print('def lex(stream):')
-        print('    return CON_LEXER.lex(stream)')
-
-class GetRule:
-    def __init__(self, rule_id):
-        self.rule_id = rule_id
-
-    def __repr__(self):
-        return 'RULES[%d]' % self.rule_id
-
-rule_ids = {}
-token_types = {}
-
-def _get_token_type(token_type):
-    if token_type not in token_types:
-        token_types[token_type] = len(token_types)
-    return token_types[token_type]
-
-class ParserAtoms:
-    def __init__(self, parser):
-        self.parse_table = parser._parse_table
-
-    def print_python(self):
-        print('class ParseTable: pass')
-        print('parse_table = ParseTable()')
-        print('STATES = {')
-        for state, actions in self.parse_table.states.items():
-            print('  %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg))
-                                         for token, (action, arg) in actions.items()}))
-        print('}')
-        print('TOKEN_TYPES = (')
-        pprint({v:k for k, v in token_types.items()})
-        print(')')
-        print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}')
-        print('                      for s, acts in STATES.items()}')
-        print('parse_table.start_state = %s' % self.parse_table.start_state)
-        print('parse_table.end_state = %s' % self.parse_table.end_state)
-        print('class Lark_StandAlone:')
-        print('  def __init__(self, transformer=None, postlex=None):')
-        print('     callbacks = parse_tree_builder.create_callback(transformer=transformer)')
-        print('     self.parser = _Parser(parse_table, callbacks)')
-        print('     self.postlex = postlex')
-        print('  def parse(self, stream):')
-        print('     tokens = lex(stream)')
-        print('     sps = CON_LEXER.set_parser_state')
-        print('     if self.postlex: tokens = self.postlex.process(tokens)')
-        print('     return self.parser.parse(tokens, sps)')
-
-class TreeBuilderAtoms:
-    def __init__(self, lark):
-        self.rules = lark.rules
-
-    def print_python(self):
-        # print('class InlineTransformer: pass')
-        print('RULES = {')
-        for i, r in enumerate(self.rules):
-            rule_ids[r] = i
-            print('  %d: Rule(%r, [%s], alias=%r, options=%r),' % (i, r.origin, ', '.join(s.fullrepr for s in r.expansion), r.alias, r.options ))
-        print('}')
-        print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)')
-
 
 def main(fobj, start):
     lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start)
 
-    lexer_atoms = ContextualLexerAtoms(lark_inst.parser.lexer)
-    parser_atoms = ParserAtoms(lark_inst.parser.parser)
-    tree_builder_atoms = TreeBuilderAtoms(lark_inst)
-
     print('# The file was automatically generated by Lark v%s' % lark.__version__)
 
     for pyfile in EXTRACT_STANDALONE_FILES:
         with open(os.path.join(_larkdir, pyfile)) as f:
             print (extract_sections(f)['standalone'])
 
-    with open(os.path.join(_larkdir, 'grammar.py')) as grammar_py:
-        print(grammar_py.read())
+    data, m = lark_inst.memo_serialize([TerminalDef])
+    print( 'DATA = (' )
+    # pprint(data, width=160)
+    print(data)
+    print(')')
+    print( 'MEMO = (')
+    print(m)
+    print(')')
+
     print('Shift = 0')
     print('Reduce = 1')
-    lexer_atoms.print_python()
-    tree_builder_atoms.print_python()
-    parser_atoms.print_python()
+    print("def load_parser():")
+    print("  return Lark.deserialize(DATA)")
+
+
+
+
 
 if __name__ == '__main__':
     if len(sys.argv) < 2:
diff --git a/lark/utils.py b/lark/utils.py
index 3dda697..0849745 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -1,8 +1,6 @@
 import sys
 from collections import deque
 
-Py36 = (sys.version_info[:2] >= (3, 6))
-
 class fzset(frozenset):
     def __repr__(self):
         return '{%s}' % ', '.join(map(repr, self))
@@ -44,56 +42,90 @@ def bfs(initial, expand):
 
 
 
-def _serialize(value):
+###{standalone
+import sys, re
+
+Py36 = (sys.version_info[:2] >= (3, 6))
+
+
+
+def _serialize(value, memo):
     if isinstance(value, Serialize):
-        return value.serialize()
+        return value.serialize(memo)
     elif isinstance(value, list):
-        return [_serialize(elem) for elem in value]
+        return [_serialize(elem, memo) for elem in value]
     elif isinstance(value, frozenset):
         return list(value) # TODO reversible?
     elif isinstance(value, dict):
-        return {key:_serialize(elem) for key, elem in value.items()}
+        return {key:_serialize(elem, memo) for key, elem in value.items()}
     return value
 
-def _deserialize(data, namespace):
+def _deserialize(data, namespace, memo):
     if isinstance(data, dict):
         if '__type__' in data: # Object
             class_ = namespace[data['__type__']]
-            return class_.deserialize(data)
-        return {key:_deserialize(value, namespace) for key, value in data.items()}
+            return class_.deserialize(data, memo)
+        return {key:_deserialize(value, namespace, memo) for key, value in data.items()}
     elif isinstance(data, list):
-        return [_deserialize(value, namespace) for value in data]
+        return [_deserialize(value, namespace, memo) for value in data]
     return data
 
 
 class Serialize(object):
-    def serialize(self):
+    def memo_serialize(self, types_to_memoize):
+        memo = SerializeMemoizer(types_to_memoize)
+        return self.serialize(memo), memo.serialize()
+
+    def serialize(self, memo=None):
+        if memo and memo.in_types(self):
+            return {'__memo__': memo.memoized.get(self)}
+
         fields = getattr(self, '__serialize_fields__')
-        res = {f: _serialize(getattr(self, f)) for f in fields}
+        res = {f: _serialize(getattr(self, f), memo) for f in fields}
         res['__type__'] = type(self).__name__
         postprocess = getattr(self, '_serialize', None)
         if postprocess:
-            postprocess(res)
+            postprocess(res, memo)
         return res
 
     @classmethod
-    def deserialize(cls, data):
-        namespace = getattr(cls, '__serialize_namespace__', dict)
-        namespace = {c.__name__:c for c in namespace()}
+    def deserialize(cls, data, memo):
+        namespace = getattr(cls, '__serialize_namespace__', {})
+        namespace = {c.__name__:c for c in namespace}
         fields = getattr(cls, '__serialize_fields__')
+
+        if '__memo__' in data:
+            return memo[data['__memo__']]
+
         inst = cls.__new__(cls)
         for f in fields:
-            setattr(inst, f, _deserialize(data[f], namespace))
+            setattr(inst, f, _deserialize(data[f], namespace, memo))
         postprocess = getattr(inst, '_deserialize', None)
         if postprocess:
             postprocess()
         return inst
 
 
+class SerializeMemoizer(Serialize):
+    __serialize_fields__ = 'memoized',
+
+    def __init__(self, types_to_memoize):
+        self.types_to_memoize = tuple(types_to_memoize)
+        self.memoized = Enumerator()
+
+    def in_types(self, value):
+        return isinstance(value, self.types_to_memoize)
+
+    def serialize(self):
+        return _serialize(self.memoized.reversed(), None)
+
+    @classmethod
+    def deserialize(cls, data, namespace, memo):
+        return _deserialize(data, namespace, memo)
+
+
 
-###{standalone
 try:
     STRING_TYPE = basestring
 except NameError:   # Python 3
@@ -178,7 +210,7 @@ def get_regexp_width(regexp):
         raise ValueError(regexp)
 
 
-class Enumerator:
+class Enumerator(Serialize):
     def __init__(self):
         self.enums = {}
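
For context: this patch makes the standalone generator reuse lark's own Serialize protocol instead of hand-printing lexer and parser "atoms". As defined in lark/utils.py above, the protocol dumps the attributes named in __serialize_fields__ into plain dicts tagged with '__type__', and rebuilds nested objects on the way back using the classes listed in __serialize_namespace__; memo_serialize() additionally pulls instances of chosen types (e.g. TerminalDef) out into a shared side table, which standalone.py prints as MEMO. Below is a minimal sketch of the base protocol; Point and Segment are invented for illustration and are not part of lark.

    # Toy sketch only: Point/Segment are made up to illustrate the Serialize
    # protocol from lark/utils.py; they do not exist in lark itself.
    from lark.utils import Serialize

    class Point(Serialize):
        __serialize_fields__ = 'x', 'y'

        def __init__(self, x, y):
            self.x = x
            self.y = y

    class Segment(Serialize):
        __serialize_fields__ = 'start', 'end'
        __serialize_namespace__ = Point,   # classes used to rebuild nested '__type__' dicts

        def __init__(self, start, end):
            self.start = start
            self.end = end

    seg = Segment(Point(0, 0), Point(3, 4))
    data = seg.serialize()                    # nested plain dicts, each tagged with '__type__'
    restored = Segment.deserialize(data, {})  # empty memo: nothing was memoized here
    assert (restored.start.x, restored.end.y) == (0, 4)

With memoization the flow is the same, except that instances of the listed types serialize to small {'__memo__': <id>} stubs and are written once into the side table returned by memo_serialize().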