From 120d5b9ffa2a41059b7fd35a892a8a5ff8aef7c4 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 30 Mar 2019 23:11:39 +0300 Subject: [PATCH 01/14] Mid work --- lark/grammar.py | 14 ++++++++++ lark/lark.py | 1 + lark/lexer.py | 12 ++++++++ lark/parser_frontends.py | 16 ++++++++++- lark/parsers/lalr_parser.py | 56 +++++++++++++++++++++++++++++++++++-- 5 files changed, 96 insertions(+), 3 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index e171d52..8691f10 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -30,10 +30,16 @@ class Terminal(Symbol): def fullrepr(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out) + def serialize(self): + return ['T', self.name, self.filter_out] + class NonTerminal(Symbol): is_term = False + def serialize(self): + return ['NT', self.name] + class Rule(object): """ origin : a symbol @@ -64,6 +70,11 @@ class Rule(object): return False return self.origin == other.origin and self.expansion == other.expansion + def serialize(self): + return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.alias, self.options.serialize() if self.options else None] + # def deserialize(self): + # return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.alias, self.options.serialize() if self.options else None] + class RuleOptions: def __init__(self, keep_all_tokens=False, expand1=False, priority=None): @@ -78,3 +89,6 @@ class RuleOptions: self.expand1, self.priority, ) + + def serialize(self): + return [self.keep_all_tokens, self.expand1, self.priority, list(self.empty_indices)] \ No newline at end of file diff --git a/lark/lark.py b/lark/lark.py index 178141c..eb73271 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -208,6 +208,7 @@ class Lark: return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + @classmethod def open(cls, grammar_filename, rel_to=None, **options): """Create an instance of Lark with the grammar given by its filename diff --git a/lark/lexer.py b/lark/lexer.py index e6e9e9e..00ff35c 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -65,6 +65,9 @@ class TerminalDef(object): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) + def serialize(self): + return [self.name, self.pattern, self.priority] + ###{standalone @@ -307,6 +310,13 @@ class TraditionalLexer(Lexer): def lex(self, stream): return _Lex(self).lex(stream, self.newline_types, self.ignore_types) + def serialize(self): + return { + 'terminals': [t.serialize() for t in self.terminals], + 'ignore_types': self.ignore_types, + 'newline_types': self.newline_types, + } + class ContextualLexer(Lexer): def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): @@ -343,4 +353,6 @@ class ContextualLexer(Lexer): l.lexer = self.lexers[self.parser_state] l.state = self.parser_state + def serialize(self): + return {state: lexer.serialize() for state, lexer in self.lexers.items()} diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index c351ddc..b93592c 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -7,7 +7,7 @@ from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token from .parsers import lalr_parser, earley, xearley, cyk from .tree import Tree -class WithLexer: +class WithLexer(object): lexer = None parser = None lexer_conf = None @@ -36,6 +36,20 @@ class WithLexer: sps = self.lexer.set_parser_state return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else []) + def 
serialize(self): + return { + # 'class': type(self).__name__, + 'parser': self.parser.serialize(), + 'lexer': self.lexer.serialize(), + } + @classmethod + def deserialize(cls, data): + inst = cls.__new__(cls) + inst.parser = lalr_parser.Parser.deserialize(data['parser']) + inst.lexer = Lexer.deserialize(data['lexer']) + return inst + + class LALR_TraditionalLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): debug = options.debug if options else False diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index c30a92e..4de9496 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -4,10 +4,30 @@ # Email : erezshin@gmail.com from ..exceptions import UnexpectedToken from ..lexer import Token +from ..grammar import Rule -from .lalr_analysis import LALR_Analyzer, Shift +from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable -class Parser: + +class Enumerator: + def __init__(self): + self.enums = {} + + def get(self, item): + if item not in self.enums: + self.enums[item] = len(self.enums) + return self.enums[item] + + def __len__(self): + return len(self.enums) + + def reversed(self): + r = {v: k for k, v in self.enums.items()} + assert len(r) == len(self.enums) + return r + + +class Parser(object): def __init__(self, parser_conf, debug=False): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" @@ -20,6 +40,38 @@ class Parser: self.parser = _Parser(analysis.parse_table, callbacks) self.parse = self.parser.parse + def serialize(self): + tokens = Enumerator() + rules = Enumerator() + + states = { + state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()} + for state, actions in self._parse_table.states.items() + } + + return { + 'tokens': tokens.reversed(), + 'rules': {idx: r.serialize() for idx, r in rules.reversed().items()}, + 'states': states, + 'start_state': self._parse_table.start_state, + 'end_state': self._parse_table.end_state, + } + + @classmethod + def deserialize(cls, data): + tokens = data['tokens'] + rules = {idx: Rule.deserialize(r) for idx, r in data['rules'].items()} + states = { + state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg)) + for token, (action, arg) in actions.items()} + for state, actions in data['states'].items() + } + parse_table = IntParseTable(states, data['start_state'], data['end_state']) + print(parse_table) + + + ###{standalone class _Parser: From 335206911d1cc7f9f6eee2bd7946cc2b5d525deb Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Apr 2019 17:47:32 +0300 Subject: [PATCH 02/14] Basic serialize/deserialize working! 
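
The user-facing surface at this point is a pair of methods on Lark: serialize() flattens the options, rules, lexer and LALR parse table into plain dicts and lists, and Lark.deserialize() rebuilds a working parser from that data without re-running the grammar analysis. A minimal sketch of the round-trip this patch enables (illustrative only: the grammar and input below are made up, and the exact signatures keep shifting in the later patches of this series):

    from lark import Lark

    parser = Lark('''
        start: NAME "=" NUMBER
        %import common.CNAME -> NAME
        %import common.NUMBER
        %ignore " "
    ''', parser='lalr')

    data = parser.serialize()          # nested dicts/lists: options, rules, lexer, parse table
    parser2 = Lark.deserialize(data)   # reconstructed from the tables, no LALR analysis
    parser2.parse("answer = 42")

The serialized form is not yet guaranteed to survive json.dumps(); that is addressed in patch 04.
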
--- lark/grammar.py | 34 +++++++++++++--- lark/lark.py | 79 ++++++++++++++++++++++++++++++------- lark/lexer.py | 35 +++++++++++++++- lark/parse_tree_builder.py | 4 +- lark/parser_frontends.py | 20 ++++++---- lark/parsers/lalr_parser.py | 7 +++- 6 files changed, 146 insertions(+), 33 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 8691f10..0dc1c21 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -19,6 +19,15 @@ class Symbol(object): fullrepr = property(__repr__) + @classmethod + def deserialize(cls, data): + class_ = { + 'T': Terminal, + 'NT': NonTerminal, + }[data[0]] + return class_(*data[1:]) + + class Terminal(Symbol): is_term = True @@ -71,17 +80,26 @@ class Rule(object): return self.origin == other.origin and self.expansion == other.expansion def serialize(self): - return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.alias, self.options.serialize() if self.options else None] - # def deserialize(self): - # return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.alias, self.options.serialize() if self.options else None] + return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.order, self.alias, self.options.serialize() if self.options else None] + + @classmethod + def deserialize(cls, data): + origin, expansion, order, alias, options = data + return cls( + Symbol.deserialize(origin), + [Symbol.deserialize(s) for s in expansion], + order, + alias, + RuleOptions.deserialize(options) if options else None + ) class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, priority=None): + def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 self.priority = priority - self.empty_indices = () + self.empty_indices = empty_indices def __repr__(self): return 'RuleOptions(%r, %r, %r)' % ( @@ -91,4 +109,8 @@ class RuleOptions: ) def serialize(self): - return [self.keep_all_tokens, self.expand1, self.priority, list(self.empty_indices)] \ No newline at end of file + return [self.keep_all_tokens, self.expand1, self.priority, list(self.empty_indices)] + + @classmethod + def deserialize(cls, data): + return cls(*data) diff --git a/lark/lark.py b/lark/lark.py index eb73271..9fa5017 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -51,24 +51,39 @@ class LarkOptions(object): if __doc__: __doc__ += OPTIONS_DOC + _defaults = { + 'debug': False, + 'keep_all_tokens': False, + 'tree_class': Tree, + 'cache_grammar': False, + 'postlex': None, + 'parser': 'earley', + 'lexer': 'auto', + 'transformer': None, + 'start': 'start', + 'profile': False, + 'priority': 'auto', + 'ambiguity': 'auto', + 'propagate_positions': False, + 'lexer_callbacks': {}, + 'maybe_placeholders': False, + } + def __init__(self, options_dict): o = dict(options_dict) - self.debug = bool(o.pop('debug', False)) - self.keep_all_tokens = bool(o.pop('keep_all_tokens', False)) - self.tree_class = o.pop('tree_class', Tree) - self.cache_grammar = o.pop('cache_grammar', False) - self.postlex = o.pop('postlex', None) - self.parser = o.pop('parser', 'earley') - self.lexer = o.pop('lexer', 'auto') - self.transformer = o.pop('transformer', None) - self.start = o.pop('start', 'start') - self.profile = o.pop('profile', False) - self.priority = o.pop('priority', 'auto') - self.ambiguity = o.pop('ambiguity', 'auto') - self.propagate_positions = o.pop('propagate_positions', False) - self.lexer_callbacks = o.pop('lexer_callbacks', {}) - 
self.maybe_placeholders = o.pop('maybe_placeholders', False) + options = {} + for name, default in self._defaults.items(): + if name in o: + value = o.pop(name) + if isinstance(default, bool): + value = bool(value) + else: + value = default + + options[name] = value + + self.__dict__['options'] = options assert self.parser in ('earley', 'lalr', 'cyk', None) @@ -79,6 +94,18 @@ class LarkOptions(object): if o: raise ValueError("Unknown options: %s" % o.keys()) + def __getattr__(self, name): + return self.options[name] + def __setattr__(self, name, value): + self.options[name] = value + + def serialize(self): + return self.options + + @classmethod + def deserialize(cls, data): + return cls(data) + class Profiler: def __init__(self): @@ -208,6 +235,28 @@ class Lark: return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + def serialize(self): + return { + 'parser': self.parser.serialize(), + 'rules': [r.serialize() for r in self.rules], + 'options': self.options.serialize(), + } + + @classmethod + def deserialize(cls, data): + from .grammar import Rule + inst = cls.__new__(cls) + + rules = [Rule.deserialize(r) for r in data['rules']] + options = LarkOptions.deserialize(data['options']) + + ptb = ParseTreeBuilder(rules, options.tree_class, options.propagate_positions, options.keep_all_tokens, options.parser!='lalr' and options.ambiguity=='explicit', options.maybe_placeholders) + callbacks = ptb.create_callback(None) + + parser_class = get_frontend(options.parser, options.lexer) + inst.parser = parser_class.deserialize(data['parser'], callbacks) + return inst + @classmethod def open(cls, grammar_filename, rel_to=None, **options): diff --git a/lark/lexer.py b/lark/lexer.py index 00ff35c..0a7eaed 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -68,6 +68,10 @@ class TerminalDef(object): def serialize(self): return [self.name, self.pattern, self.priority] + @classmethod + def deserialize(cls, data): + return cls(*data) + ###{standalone @@ -268,6 +272,14 @@ class Lexer: set_parser_state = NotImplemented lex = NotImplemented + @classmethod + def deserialize(cls, data): + class_ = { + 'traditional': TraditionalLexer, + 'contextual': ContextualLexer, + }[data['type']] + return class_.deserialize(data) + class TraditionalLexer(Lexer): def __init__(self, terminals, ignore=(), user_callbacks={}): assert all(isinstance(t, TerminalDef) for t in terminals), terminals @@ -312,11 +324,22 @@ class TraditionalLexer(Lexer): def serialize(self): return { + 'type': 'traditional', 'terminals': [t.serialize() for t in self.terminals], 'ignore_types': self.ignore_types, 'newline_types': self.newline_types, } + @classmethod + def deserialize(cls, data): + inst = cls.__new__(cls) + inst.terminals = [TerminalDef.deserialize(t) for t in data['terminals']] + inst.mres = build_mres(inst.terminals) + inst.ignore_types = data['ignore_types'] + inst.newline_types = data['newline_types'] + inst.callback = {} # TODO implement + return inst + class ContextualLexer(Lexer): def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): @@ -354,5 +377,15 @@ class ContextualLexer(Lexer): l.state = self.parser_state def serialize(self): - return {state: lexer.serialize() for state, lexer in self.lexers.items()} + return { + 'type': 'contextual', + 'root_lexer': self.root_lexer.serialize(), + 'lexers': {state: lexer.serialize() for state, lexer in self.lexers.items()} + } + @classmethod + def deserialize(cls, data): + inst = cls.__new__(cls) + inst.lexers = {state:Lexer.deserialize(lexer) 
for state, lexer in data['lexers'].items()} + inst.root_lexer = TraditionalLexer.deserialize(data['root_lexer']) + return inst \ No newline at end of file diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index ca12d5f..977c371 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -209,12 +209,12 @@ class ParseTreeBuilder: keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand_single_child = options.expand1 if options else False - wrapper_chain = filter(None, [ + wrapper_chain = list(filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None), self.propagate_positions and PropagatePositions, self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), - ]) + ])) yield rule, wrapper_chain diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index b93592c..e9d3b1b 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -15,11 +15,13 @@ class WithLexer(object): def init_traditional_lexer(self, lexer_conf): self.lexer_conf = lexer_conf self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) + self.postlex = lexer_conf.postlex def init_contextual_lexer(self, lexer_conf): self.lexer_conf = lexer_conf + self.postlex = lexer_conf.postlex states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} - always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () + always_accept = self.postlex.always_accept if self.postlex else () self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore, always_accept=always_accept, @@ -27,8 +29,8 @@ class WithLexer(object): def lex(self, text): stream = self.lexer.lex(text) - if self.lexer_conf.postlex: - return self.lexer_conf.postlex.process(stream) + if self.postlex: + return self.postlex.process(stream) return stream def parse(self, text): @@ -38,15 +40,19 @@ class WithLexer(object): def serialize(self): return { - # 'class': type(self).__name__, + 'type': type(self).__name__, 'parser': self.parser.serialize(), 'lexer': self.lexer.serialize(), } @classmethod - def deserialize(cls, data): - inst = cls.__new__(cls) - inst.parser = lalr_parser.Parser.deserialize(data['parser']) + def deserialize(cls, data, callbacks): + class_ = globals()[data['type']] # XXX unsafe + parser = lalr_parser.Parser.deserialize(data['parser'], callbacks) + assert parser + inst = class_.__new__(class_) + inst.parser = parser inst.lexer = Lexer.deserialize(data['lexer']) + inst.postlex = None # TODO return inst diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 4de9496..c943693 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -59,7 +59,7 @@ class Parser(object): } @classmethod - def deserialize(cls, data): + def deserialize(cls, data, callbacks): tokens = data['tokens'] rules = {idx: Rule.deserialize(r) for idx, r in data['rules'].items()} states = { @@ -68,7 +68,10 @@ class Parser(object): for state, actions in data['states'].items() } parse_table = IntParseTable(states, data['start_state'], data['end_state']) - print(parse_table) + inst = cls.__new__(cls) + inst.parser = _Parser(parse_table, callbacks) + inst.parse = inst.parser.parse + return inst From 244f67166575f51cb099ac55b895179b3c2555e8 
Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Apr 2019 18:15:18 +0300 Subject: [PATCH 03/14] Small refactor --- lark/lark.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 9fa5017..1ffa0af 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -225,14 +225,14 @@ class Lark: def _build_lexer(self): return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) - def _build_parser(self): + def _prepare_callbacks(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders) - callbacks = self._parse_tree_builder.create_callback(self.options.transformer) - - parser_conf = ParserConf(self.rules, callbacks, self.options.start) + self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer) + def _build_parser(self): + self._prepare_callbacks() + parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) def serialize(self): @@ -246,15 +246,10 @@ class Lark: def deserialize(cls, data): from .grammar import Rule inst = cls.__new__(cls) - - rules = [Rule.deserialize(r) for r in data['rules']] - options = LarkOptions.deserialize(data['options']) - - ptb = ParseTreeBuilder(rules, options.tree_class, options.propagate_positions, options.keep_all_tokens, options.parser!='lalr' and options.ambiguity=='explicit', options.maybe_placeholders) - callbacks = ptb.create_callback(None) - - parser_class = get_frontend(options.parser, options.lexer) - inst.parser = parser_class.deserialize(data['parser'], callbacks) + inst.options = LarkOptions.deserialize(data['options']) + inst.rules = [Rule.deserialize(r) for r in data['rules']] + inst._prepare_callbacks() + inst.parser = inst.parser_class.deserialize(data['parser'], inst._callbacks) return inst From 066303fdab25edcca61a1f131c8ba39b2045b6cf Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 8 Apr 2019 15:21:02 +0300 Subject: [PATCH 04/14] Serialized lark is now json compatible --- lark/lark.py | 5 +++-- lark/lexer.py | 21 +++++++++++++++++++-- lark/parser_frontends.py | 5 ++++- lark/parsers/lalr_parser.py | 19 +------------------ lark/utils.py | 19 +++++++++++++++++++ 5 files changed, 46 insertions(+), 23 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 1ffa0af..abac7fc 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -54,7 +54,7 @@ class LarkOptions(object): _defaults = { 'debug': False, 'keep_all_tokens': False, - 'tree_class': Tree, + 'tree_class': None, 'cache_grammar': False, 'postlex': None, 'parser': 'earley', @@ -97,6 +97,7 @@ class LarkOptions(object): def __getattr__(self, name): return self.options[name] def __setattr__(self, name, value): + assert name in self.options self.options[name] = value def serialize(self): @@ -227,7 +228,7 @@ class Lark: def _prepare_callbacks(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders) + self._parse_tree_builder = 
ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders) self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer) def _build_parser(self): diff --git a/lark/lexer.py b/lark/lexer.py index 0a7eaed..29ba1f6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -35,6 +35,16 @@ class Pattern(object): value = ('(?%s)' % f) + value return value + @classmethod + def deserialize(cls, data): + class_ = { + 's': PatternStr, + 're': PatternRE, + }[data[0]] + value, flags = data[1:] + return class_(value, frozenset(flags)) + + class PatternStr(Pattern): def to_regexp(self): return self._get_flags(re.escape(self.value)) @@ -44,6 +54,9 @@ class PatternStr(Pattern): return len(self.value) max_width = min_width + def serialize(self): + return ['s', self.value, list(self.flags)] + class PatternRE(Pattern): def to_regexp(self): return self._get_flags(self.value) @@ -55,6 +68,9 @@ class PatternRE(Pattern): def max_width(self): return get_regexp_width(self.to_regexp())[1] + def serialize(self): + return ['re', self.value, list(self.flags)] + class TerminalDef(object): def __init__(self, name, pattern, priority=1): assert isinstance(pattern, Pattern), pattern @@ -66,11 +82,12 @@ class TerminalDef(object): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) def serialize(self): - return [self.name, self.pattern, self.priority] + return [self.name, self.pattern.serialize(), self.priority] @classmethod def deserialize(cls, data): - return cls(*data) + name, pattern, priority = data + return cls(name, Pattern.deserialize(pattern), priority) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e9d3b1b..5a744ea 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -46,7 +46,10 @@ class WithLexer(object): } @classmethod def deserialize(cls, data, callbacks): - class_ = globals()[data['type']] # XXX unsafe + class_ = { + 'LALR_TraditionalLexer': LALR_TraditionalLexer, + 'LALR_ContextualLexer': LALR_ContextualLexer, + }[data['type']] # XXX unsafe parser = lalr_parser.Parser.deserialize(data['parser'], callbacks) assert parser inst = class_.__new__(class_) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index c943693..a7eebf6 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -5,28 +5,11 @@ from ..exceptions import UnexpectedToken from ..lexer import Token from ..grammar import Rule +from ..utils import Enumerator from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable -class Enumerator: - def __init__(self): - self.enums = {} - - def get(self, item): - if item not in self.enums: - self.enums[item] = len(self.enums) - return self.enums[item] - - def __len__(self): - return len(self.enums) - - def reversed(self): - r = {v: k for k, v in self.enums.items()} - assert len(r) == len(self.enums) - return r - - class Parser(object): def __init__(self, parser_conf, debug=False): assert all(r.options is None or r.options.priority is None diff --git a/lark/utils.py b/lark/utils.py index ea1eb21..d65cac2 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -128,3 +128,22 @@ def get_regexp_width(regexp): return sre_parse.parse(regexp).getwidth() except sre_constants.error: raise ValueError(regexp) + + +class Enumerator: + def __init__(self): + self.enums = {} + + def get(self, item): + if item not in self.enums: + 
self.enums[item] = len(self.enums) + return self.enums[item] + + def __len__(self): + return len(self.enums) + + def reversed(self): + r = {v: k for k, v in self.enums.items()} + assert len(r) == len(self.enums) + return r + From dd84f6c0a87d945f9e0a7ddc65132a2f38076dd7 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 8 Apr 2019 16:09:33 +0300 Subject: [PATCH 05/14] Tiny refactor --- lark/parser_frontends.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 5a744ea..8ef3c8e 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -29,9 +29,7 @@ class WithLexer(object): def lex(self, text): stream = self.lexer.lex(text) - if self.postlex: - return self.postlex.process(stream) - return stream + return self.postlex.process(stream) if self.postlex else stream def parse(self, text): token_stream = self.lex(text) @@ -49,7 +47,7 @@ class WithLexer(object): class_ = { 'LALR_TraditionalLexer': LALR_TraditionalLexer, 'LALR_ContextualLexer': LALR_ContextualLexer, - }[data['type']] # XXX unsafe + }[data['type']] parser = lalr_parser.Parser.deserialize(data['parser'], callbacks) assert parser inst = class_.__new__(class_) From d13ebb9c15a16ac31449827bcf868a61bd4fcc1e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 8 Apr 2019 20:29:01 +0300 Subject: [PATCH 06/14] Using a mostly-generic serialization method --- lark/grammar.py | 52 +++++++--------------- lark/lark.py | 17 +++---- lark/lexer.py | 83 +++++++++-------------------------- lark/parser_frontends.py | 38 +++++++--------- lark/parsers/lalr_analysis.py | 32 +++++++++++++- lark/parsers/lalr_parser.py | 38 +++------------- lark/utils.py | 48 ++++++++++++++++++++ 7 files changed, 145 insertions(+), 163 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 0dc1c21..adde150 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,4 +1,6 @@ -class Symbol(object): +from .utils import Serialize + +class Symbol(Serialize): is_term = NotImplemented def __init__(self, name): @@ -19,16 +21,10 @@ class Symbol(object): fullrepr = property(__repr__) - @classmethod - def deserialize(cls, data): - class_ = { - 'T': Terminal, - 'NT': NonTerminal, - }[data[0]] - return class_(*data[1:]) - class Terminal(Symbol): + __serialize_fields__ = 'name', 'filter_out' + is_term = True def __init__(self, name, filter_out=False): @@ -39,23 +35,25 @@ class Terminal(Symbol): def fullrepr(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out) - def serialize(self): - return ['T', self.name, self.filter_out] class NonTerminal(Symbol): + __serialize_fields__ = 'name', + is_term = False - def serialize(self): - return ['NT', self.name] -class Rule(object): +class Rule(Serialize): """ origin : a symbol expansion : a list of symbols order : index of this expansion amongst all rules of the same name """ __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') + + __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' + __serialize_namespace__ = lambda: (Terminal, NonTerminal, RuleOptions) + def __init__(self, origin, expansion, order=0, alias=None, options=None): self.origin = origin self.expansion = expansion @@ -64,6 +62,8 @@ class Rule(object): self.options = options self._hash = hash((self.origin, tuple(self.expansion))) + def _deserialize(self): + self._hash = hash((self.origin, tuple(self.expansion))) def __str__(self): return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion)) @@ 
-79,22 +79,11 @@ class Rule(object): return False return self.origin == other.origin and self.expansion == other.expansion - def serialize(self): - return [self.origin.serialize(), list(s.serialize() for s in self.expansion), self.order, self.alias, self.options.serialize() if self.options else None] - - @classmethod - def deserialize(cls, data): - origin, expansion, order, alias, options = data - return cls( - Symbol.deserialize(origin), - [Symbol.deserialize(s) for s in expansion], - order, - alias, - RuleOptions.deserialize(options) if options else None - ) -class RuleOptions: +class RuleOptions(Serialize): + __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices' + def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 @@ -107,10 +96,3 @@ class RuleOptions: self.expand1, self.priority, ) - - def serialize(self): - return [self.keep_all_tokens, self.expand1, self.priority, list(self.empty_indices)] - - @classmethod - def deserialize(cls, data): - return cls(*data) diff --git a/lark/lark.py b/lark/lark.py index abac7fc..3d81b21 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -5,7 +5,7 @@ import time from collections import defaultdict from io import open -from .utils import STRING_TYPE +from .utils import STRING_TYPE, Serialize from .load_grammar import load_grammar from .tree import Tree from .common import LexerConf, ParserConf @@ -13,9 +13,10 @@ from .common import LexerConf, ParserConf from .lexer import Lexer, TraditionalLexer from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import get_frontend +from .grammar import Rule -class LarkOptions(object): +class LarkOptions(Serialize): """Specifies the options for Lark """ @@ -132,7 +133,7 @@ class Profiler: return wrapper -class Lark: +class Lark(Serialize): def __init__(self, grammar, **options): """ grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) @@ -223,6 +224,8 @@ class Lark: if __init__.__doc__: __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC + __serialize_fields__ = 'parser', 'rules', 'options' + def _build_lexer(self): return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) @@ -236,16 +239,8 @@ class Lark: parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) - def serialize(self): - return { - 'parser': self.parser.serialize(), - 'rules': [r.serialize() for r in self.rules], - 'options': self.options.serialize(), - } - @classmethod def deserialize(cls, data): - from .grammar import Rule inst = cls.__new__(cls) inst.options = LarkOptions.deserialize(data['options']) inst.rules = [Rule.deserialize(r) for r in data['rules']] diff --git a/lark/lexer.py b/lark/lexer.py index 29ba1f6..ae370dc 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -2,10 +2,12 @@ import re -from .utils import Str, classify, get_regexp_width, Py36 +from .utils import Str, classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError -class Pattern(object): +class Pattern(Serialize): + __serialize_fields__ = 'value', 'flags' + def __init__(self, value, flags=()): self.value = value self.flags = frozenset(flags) @@ -35,15 +37,6 @@ class Pattern(object): value = ('(?%s)' % f) + value return value - @classmethod - def deserialize(cls, data): - class_ = { - 's': PatternStr, - 
're': PatternRE, - }[data[0]] - value, flags = data[1:] - return class_(value, frozenset(flags)) - class PatternStr(Pattern): def to_regexp(self): @@ -54,9 +47,6 @@ class PatternStr(Pattern): return len(self.value) max_width = min_width - def serialize(self): - return ['s', self.value, list(self.flags)] - class PatternRE(Pattern): def to_regexp(self): return self._get_flags(self.value) @@ -68,10 +58,11 @@ class PatternRE(Pattern): def max_width(self): return get_regexp_width(self.to_regexp())[1] - def serialize(self): - return ['re', self.value, list(self.flags)] -class TerminalDef(object): +class TerminalDef(Serialize): + __serialize_fields__ = 'name', 'pattern', 'priority' + __serialize_namespace__ = lambda: (PatternStr, PatternRE) + def __init__(self, name, pattern, priority=1): assert isinstance(pattern, Pattern), pattern self.name = name @@ -81,14 +72,6 @@ class TerminalDef(object): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - def serialize(self): - return [self.name, self.pattern.serialize(), self.priority] - - @classmethod - def deserialize(cls, data): - name, pattern, priority = data - return cls(name, Pattern.deserialize(pattern), priority) - ###{standalone @@ -278,7 +261,7 @@ def _regexp_has_newline(r): """ return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r) -class Lexer: +class Lexer(Serialize): """Lexer interface Method Signatures: @@ -289,15 +272,16 @@ class Lexer: set_parser_state = NotImplemented lex = NotImplemented - @classmethod - def deserialize(cls, data): - class_ = { - 'traditional': TraditionalLexer, - 'contextual': ContextualLexer, - }[data['type']] - return class_.deserialize(data) class TraditionalLexer(Lexer): + __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types' + __serialize_namespace__ = lambda: (TerminalDef,) + + def _deserialize(self): + self.mres = build_mres(self.terminals) + self.callback = {} # TODO implement + + def __init__(self, terminals, ignore=(), user_callbacks={}): assert all(isinstance(t, TerminalDef) for t in terminals), terminals @@ -339,26 +323,13 @@ class TraditionalLexer(Lexer): def lex(self, stream): return _Lex(self).lex(stream, self.newline_types, self.ignore_types) - def serialize(self): - return { - 'type': 'traditional', - 'terminals': [t.serialize() for t in self.terminals], - 'ignore_types': self.ignore_types, - 'newline_types': self.newline_types, - } - @classmethod - def deserialize(cls, data): - inst = cls.__new__(cls) - inst.terminals = [TerminalDef.deserialize(t) for t in data['terminals']] - inst.mres = build_mres(inst.terminals) - inst.ignore_types = data['ignore_types'] - inst.newline_types = data['newline_types'] - inst.callback = {} # TODO implement - return inst class ContextualLexer(Lexer): + __serialize_fields__ = 'root_lexer', 'lexers' + __serialize_namespace__ = lambda: (TraditionalLexer,) + def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): tokens_by_name = {} for t in terminals: @@ -392,17 +363,3 @@ class ContextualLexer(Lexer): yield x l.lexer = self.lexers[self.parser_state] l.state = self.parser_state - - def serialize(self): - return { - 'type': 'contextual', - 'root_lexer': self.root_lexer.serialize(), - 'lexers': {state: lexer.serialize() for state, lexer in self.lexers.items()} - } - - @classmethod - def deserialize(cls, data): - inst = cls.__new__(cls) - inst.lexers = {state:Lexer.deserialize(lexer) for state, lexer in data['lexers'].items()} - inst.root_lexer = 
TraditionalLexer.deserialize(data['root_lexer']) - return inst \ No newline at end of file diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 8ef3c8e..7c47173 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,17 +1,31 @@ import re from functools import partial -from .utils import get_regexp_width +from .utils import get_regexp_width, Serialize from .parsers.grammar_analysis import GrammarAnalyzer from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token from .parsers import lalr_parser, earley, xearley, cyk +from .grammar import Rule from .tree import Tree -class WithLexer(object): +class WithLexer(Serialize): lexer = None parser = None lexer_conf = None + __serialize_fields__ = 'parser', 'lexer' + __serialize_namespace__ = lambda: (Rule, ContextualLexer, LALR_ContextualLexer) + + @classmethod + def deserialize(cls, data, callbacks): + inst = super(WithLexer, cls).deserialize(data) + inst.postlex = None # TODO + inst.parser = lalr_parser.Parser.deserialize(inst.parser, callbacks) + return inst + + def _serialize(self, data): + data['parser'] = data['parser'].serialize() + def init_traditional_lexer(self, lexer_conf): self.lexer_conf = lexer_conf self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) @@ -36,26 +50,6 @@ class WithLexer(object): sps = self.lexer.set_parser_state return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else []) - def serialize(self): - return { - 'type': type(self).__name__, - 'parser': self.parser.serialize(), - 'lexer': self.lexer.serialize(), - } - @classmethod - def deserialize(cls, data, callbacks): - class_ = { - 'LALR_TraditionalLexer': LALR_TraditionalLexer, - 'LALR_ContextualLexer': LALR_ContextualLexer, - }[data['type']] - parser = lalr_parser.Parser.deserialize(data['parser'], callbacks) - assert parser - inst = class_.__new__(class_) - inst.parser = parser - inst.lexer = Lexer.deserialize(data['lexer']) - inst.postlex = None # TODO - return inst - class LALR_TraditionalLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 950fb27..3dced8e 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -9,7 +9,7 @@ For now, shift/reduce conflicts are automatically resolved as shifts. 
import logging from collections import defaultdict -from ..utils import classify, classify_bool, bfs, fzset +from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError from .grammar_analysis import GrammarAnalyzer, Terminal @@ -31,6 +31,36 @@ class ParseTable: self.start_state = start_state self.end_state = end_state + def serialize(self): + tokens = Enumerator() + rules = Enumerator() + + states = { + state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()} + for state, actions in self.states.items() + } + + return { + 'tokens': tokens.reversed(), + 'rules': {idx: r.serialize() for idx, r in rules.reversed().items()}, + 'states': states, + 'start_state': self.start_state, + 'end_state': self.end_state, + } + + @classmethod + def deserialize(cls, data): + tokens = data['tokens'] + rules = data['rules'] + states = { + state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg)) + for token, (action, arg) in actions.items()} + for state, actions in data['states'].items() + } + return cls(states, data['start_state'], data['end_state']) + + class IntParseTable(ParseTable): @classmethod diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index a7eebf6..701d158 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -5,12 +5,12 @@ from ..exceptions import UnexpectedToken from ..lexer import Token from ..grammar import Rule -from ..utils import Enumerator +from ..utils import Enumerator, Serialize from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable -class Parser(object): +class Parser: def __init__(self, parser_conf, debug=False): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" @@ -21,42 +21,18 @@ class Parser(object): self._parse_table = analysis.parse_table self.parser_conf = parser_conf self.parser = _Parser(analysis.parse_table, callbacks) - self.parse = self.parser.parse - def serialize(self): - tokens = Enumerator() - rules = Enumerator() - - states = { - state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg)) - for token, (action, arg) in actions.items()} - for state, actions in self._parse_table.states.items() - } - - return { - 'tokens': tokens.reversed(), - 'rules': {idx: r.serialize() for idx, r in rules.reversed().items()}, - 'states': states, - 'start_state': self._parse_table.start_state, - 'end_state': self._parse_table.end_state, - } - @classmethod def deserialize(cls, data, callbacks): - tokens = data['tokens'] - rules = {idx: Rule.deserialize(r) for idx, r in data['rules'].items()} - states = { - state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg)) - for token, (action, arg) in actions.items()} - for state, actions in data['states'].items() - } - parse_table = IntParseTable(states, data['start_state'], data['end_state']) inst = cls.__new__(cls) - inst.parser = _Parser(parse_table, callbacks) - inst.parse = inst.parser.parse + inst.parser = _Parser(IntParseTable.deserialize(data), callbacks) return inst + def serialize(self): + return self._parse_table.serialize() + def parse(self, *args): + return self.parser.parse(*args) ###{standalone diff --git a/lark/utils.py b/lark/utils.py index d65cac2..3dda697 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -44,6 +44,54 @@ def bfs(initial, expand): +def _serialize(value): + if isinstance(value, 
Serialize): + return value.serialize() + elif isinstance(value, list): + return [_serialize(elem) for elem in value] + elif isinstance(value, frozenset): + return list(value) # TODO reversible? + elif isinstance(value, dict): + return {key:_serialize(elem) for key, elem in value.items()} + return value + +def _deserialize(data, namespace): + if isinstance(data, dict): + if '__type__' in data: # Object + class_ = namespace[data['__type__']] + return class_.deserialize(data) + return {key:_deserialize(value, namespace) for key, value in data.items()} + elif isinstance(data, list): + return [_deserialize(value, namespace) for value in data] + return data + + +class Serialize(object): + def serialize(self): + fields = getattr(self, '__serialize_fields__') + res = {f: _serialize(getattr(self, f)) for f in fields} + res['__type__'] = type(self).__name__ + postprocess = getattr(self, '_serialize', None) + if postprocess: + postprocess(res) + return res + + @classmethod + def deserialize(cls, data): + namespace = getattr(cls, '__serialize_namespace__', dict) + namespace = {c.__name__:c for c in namespace()} + + fields = getattr(cls, '__serialize_fields__') + + inst = cls.__new__(cls) + for f in fields: + setattr(inst, f, _deserialize(data[f], namespace)) + postprocess = getattr(inst, '_deserialize', None) + if postprocess: + postprocess() + return inst + + ###{standalone try: From e52cc46fc5ce4155e46ba94fe91140fe8a3cf46d Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 8 Apr 2019 22:09:22 +0300 Subject: [PATCH 07/14] Minor cleanup --- lark/parsers/lalr_parser.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 701d158..987ef34 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -4,10 +4,9 @@ # Email : erezshin@gmail.com from ..exceptions import UnexpectedToken from ..lexer import Token -from ..grammar import Rule from ..utils import Enumerator, Serialize -from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable +from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable class Parser: From 94e15fb6f7a737fed15c98d9e3b1cb8f2cefafe0 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Apr 2019 11:15:50 +0300 Subject: [PATCH 08/14] Mid work. 
Almost stable --- lark/grammar.py | 38 ++++---- lark/lark.py | 25 ++---- lark/lexer.py | 12 +-- lark/parser_frontends.py | 23 +++-- lark/parsers/lalr_analysis.py | 7 +- lark/parsers/lalr_parser.py | 8 +- lark/tools/standalone.py | 158 ++++++---------------------------- lark/utils.py | 70 +++++++++++---- 8 files changed, 134 insertions(+), 207 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index adde150..14893fb 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -1,5 +1,7 @@ from .utils import Serialize +###{standalone + class Symbol(Serialize): is_term = NotImplemented @@ -43,6 +45,24 @@ class NonTerminal(Symbol): is_term = False + +class RuleOptions(Serialize): + __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices' + + def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.priority = priority + self.empty_indices = empty_indices + + def __repr__(self): + return 'RuleOptions(%r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.priority, + ) + + class Rule(Serialize): """ origin : a symbol @@ -52,7 +72,7 @@ class Rule(Serialize): __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' - __serialize_namespace__ = lambda: (Terminal, NonTerminal, RuleOptions) + __serialize_namespace__ = Terminal, NonTerminal, RuleOptions def __init__(self, origin, expansion, order=0, alias=None, options=None): self.origin = origin @@ -81,18 +101,4 @@ class Rule(Serialize): -class RuleOptions(Serialize): - __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices' - - def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()): - self.keep_all_tokens = keep_all_tokens - self.expand1 = expand1 - self.priority = priority - self.empty_indices = empty_indices - - def __repr__(self): - return 'RuleOptions(%r, %r, %r)' % ( - self.keep_all_tokens, - self.expand1, - self.priority, - ) +###} \ No newline at end of file diff --git a/lark/lark.py b/lark/lark.py index 3d81b21..1309c60 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -15,6 +15,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import get_frontend from .grammar import Rule +###{standalone class LarkOptions(Serialize): """Specifies the options for Lark @@ -101,11 +102,11 @@ class LarkOptions(Serialize): assert name in self.options self.options[name] = value - def serialize(self): + def serialize(self, memo): return self.options @classmethod - def deserialize(cls, data): + def deserialize(cls, data, memo): return cls(data) @@ -240,12 +241,12 @@ class Lark(Serialize): return self.parser_class(self.lexer_conf, parser_conf, options=self.options) @classmethod - def deserialize(cls, data): + def deserialize(cls, data, memo): inst = cls.__new__(cls) - inst.options = LarkOptions.deserialize(data['options']) - inst.rules = [Rule.deserialize(r) for r in data['rules']] + inst.options = LarkOptions.deserialize(data['options'], memo) + inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] inst._prepare_callbacks() - inst.parser = inst.parser_class.deserialize(data['parser'], inst._callbacks) + inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks) return inst @@ -284,14 +285,4 @@ class Lark(Serialize): "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise." 
return self.parser.parse(text) - # if self.profiler: - # self.profiler.enter_section('lex') - # l = list(self.lex(text)) - # self.profiler.enter_section('parse') - # try: - # return self.parser.parse(l) - # finally: - # self.profiler.enter_section('outside_lark') - # else: - # l = list(self.lex(text)) - # return self.parser.parse(l) +###} \ No newline at end of file diff --git a/lark/lexer.py b/lark/lexer.py index ae370dc..080770d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -5,6 +5,8 @@ import re from .utils import Str, classify, get_regexp_width, Py36, Serialize from .exceptions import UnexpectedCharacters, LexError +###{standalone + class Pattern(Serialize): __serialize_fields__ = 'value', 'flags' @@ -61,7 +63,7 @@ class PatternRE(Pattern): class TerminalDef(Serialize): __serialize_fields__ = 'name', 'pattern', 'priority' - __serialize_namespace__ = lambda: (PatternStr, PatternRE) + __serialize_namespace__ = PatternStr, PatternRE def __init__(self, name, pattern, priority=1): assert isinstance(pattern, Pattern), pattern @@ -74,7 +76,6 @@ class TerminalDef(Serialize): -###{standalone class Token(Str): __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column') @@ -205,7 +206,6 @@ class CallChain: return self.callback2(t) if self.cond(t2) else t2 -###} @@ -275,7 +275,7 @@ class Lexer(Serialize): class TraditionalLexer(Lexer): __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types' - __serialize_namespace__ = lambda: (TerminalDef,) + __serialize_namespace__ = TerminalDef, def _deserialize(self): self.mres = build_mres(self.terminals) @@ -328,7 +328,7 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): __serialize_fields__ = 'root_lexer', 'lexers' - __serialize_namespace__ = lambda: (TraditionalLexer,) + __serialize_namespace__ = TraditionalLexer, def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): tokens_by_name = {} @@ -363,3 +363,5 @@ class ContextualLexer(Lexer): yield x l.lexer = self.lexers[self.parser_state] l.state = self.parser_state + +###} \ No newline at end of file diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 7c47173..6750480 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -4,26 +4,29 @@ from functools import partial from .utils import get_regexp_width, Serialize from .parsers.grammar_analysis import GrammarAnalyzer from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token -from .parsers import lalr_parser, earley, xearley, cyk +from .parsers import earley, xearley, cyk +from .parsers.lalr_parser import LALR_Parser from .grammar import Rule from .tree import Tree +###{standalone + class WithLexer(Serialize): lexer = None parser = None lexer_conf = None __serialize_fields__ = 'parser', 'lexer' - __serialize_namespace__ = lambda: (Rule, ContextualLexer, LALR_ContextualLexer) + __serialize_namespace__ = Rule, ContextualLexer @classmethod - def deserialize(cls, data, callbacks): - inst = super(WithLexer, cls).deserialize(data) + def deserialize(cls, data, memo, callbacks): + inst = super(WithLexer, cls).deserialize(data, memo) inst.postlex = None # TODO - inst.parser = lalr_parser.Parser.deserialize(inst.parser, callbacks) + inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) return inst - def _serialize(self, data): + def _serialize(self, data, memo): data['parser'] = data['parser'].serialize() def init_traditional_lexer(self, lexer_conf): @@ -54,18 +57,18 @@ class WithLexer(Serialize): class 
LALR_TraditionalLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): debug = options.debug if options else False - self.parser = lalr_parser.Parser(parser_conf, debug=debug) + self.parser = LALR_Parser(parser_conf, debug=debug) self.init_traditional_lexer(lexer_conf) class LALR_ContextualLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): debug = options.debug if options else False - self.parser = lalr_parser.Parser(parser_conf, debug=debug) + self.parser = LALR_Parser(parser_conf, debug=debug) self.init_contextual_lexer(lexer_conf) class LALR_CustomLexer(WithLexer): def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): - self.parser = lalr_parser.Parser(parser_conf) + self.parser = LALR_Parser(parser_conf) self.lexer_conf = lexer_conf self.lexer = lexer_cls(lexer_conf) @@ -190,3 +193,5 @@ def get_frontend(parser, lexer): + +###} \ No newline at end of file diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 3dced8e..e34b8c3 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -14,6 +14,8 @@ from ..exceptions import GrammarError from .grammar_analysis import GrammarAnalyzer, Terminal +###{standalone + class Action: def __init__(self, name): self.name = name @@ -50,7 +52,7 @@ class ParseTable: } @classmethod - def deserialize(cls, data): + def deserialize(cls, data, memo): tokens = data['tokens'] rules = data['rules'] states = { @@ -79,8 +81,7 @@ class IntParseTable(ParseTable): end_state = state_to_idx[parse_table.end_state] return cls(int_states, start_state, end_state) - - +###} class LALR_Analyzer(GrammarAnalyzer): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 987ef34..241a47e 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -9,7 +9,8 @@ from ..utils import Enumerator, Serialize from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable -class Parser: +###{standalone +class LALR_Parser(object): def __init__(self, parser_conf, debug=False): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" @@ -22,9 +23,9 @@ class Parser: self.parser = _Parser(analysis.parse_table, callbacks) @classmethod - def deserialize(cls, data, callbacks): + def deserialize(cls, data, memo, callbacks): inst = cls.__new__(cls) - inst.parser = _Parser(IntParseTable.deserialize(data), callbacks) + inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks) return inst def serialize(self): @@ -33,7 +34,6 @@ class Parser: def parse(self, *args): return self.parser.parse(*args) -###{standalone class _Parser: def __init__(self, parse_table, callbacks): diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 3e6cc5d..99e1929 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -36,6 +36,7 @@ # ###} +import pprint import codecs import sys import os @@ -47,6 +48,10 @@ import lark from lark import Lark from lark.parsers.lalr_analysis import Reduce + +from lark.grammar import RuleOptions +from lark.lexer import TerminalDef + _dir = path.dirname(__file__) _larkdir = path.join(_dir, path.pardir) @@ -61,9 +66,12 @@ EXTRACT_STANDALONE_FILES = [ 'lexer.py', 'parse_tree_builder.py', 'parsers/lalr_parser.py', + 'parsers/lalr_analysis.py', + 'parser_frontends.py', + 'lark.py', + 'grammar.py', ] - def extract_sections(lines): section = None text = [] @@ -83,152 +91,34 @@ def extract_sections(lines): return {name:''.join(text) for 
name, text in sections.items()} -def _prepare_mres(mres): - return [(p.pattern,{i: t for i, t in d.items()}) for p,d in mres] - -class TraditionalLexerAtoms: - def __init__(self, lexer): - self.mres = _prepare_mres(lexer.mres) - self.newline_types = lexer.newline_types - self.ignore_types = lexer.ignore_types - self.callback = {name:_prepare_mres(c.mres) - for name, c in lexer.callback.items()} - - def print_python(self): - print('import re') - print('class LexerRegexps: pass') - print('NEWLINE_TYPES = %s' % self.newline_types) - print('IGNORE_TYPES = %s' % self.ignore_types) - self._print_python('lexer') - - def _print_python(self, var_name): - print('MRES = (') - pprint(self.mres) - print(')') - print('LEXER_CALLBACK = (') - pprint(self.callback) - print(')') - print('lexer_regexps = LexerRegexps()') - print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') - print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])') - print(' for n, mres in LEXER_CALLBACK.items()}') - print('%s = (lexer_regexps)' % var_name) - - -class ContextualLexerAtoms: - def __init__(self, lexer): - self.lexer_atoms = {state: TraditionalLexerAtoms(lexer) for state, lexer in lexer.lexers.items()} - self.root_lexer_atoms = TraditionalLexerAtoms(lexer.root_lexer) - - def print_python(self): - print('import re') - print('class LexerRegexps: pass') - print('NEWLINE_TYPES = %s' % self.root_lexer_atoms.newline_types) - print('IGNORE_TYPES = %s' % self.root_lexer_atoms.ignore_types) - - print('LEXERS = {}') - for state, lexer_atoms in self.lexer_atoms.items(): - lexer_atoms._print_python('LEXERS[%d]' % state) - - print('class ContextualLexer:') - print(' def __init__(self):') - print(' self.lexers = LEXERS') - print(' self.set_parser_state(None)') - print(' def set_parser_state(self, state):') - print(' self.parser_state = state') - print(' def lex(self, stream):') - print(' newline_types = NEWLINE_TYPES') - print(' ignore_types = IGNORE_TYPES') - print(' lexers = LEXERS') - print(' l = _Lex(lexers[self.parser_state], self.parser_state)') - print(' for x in l.lex(stream, newline_types, ignore_types):') - print(' yield x') - print(' l.lexer = lexers[self.parser_state]') - print(' l.state = self.parser_state') - - print('CON_LEXER = ContextualLexer()') - print('def lex(stream):') - print(' return CON_LEXER.lex(stream)') - -class GetRule: - def __init__(self, rule_id): - self.rule_id = rule_id - - def __repr__(self): - return 'RULES[%d]' % self.rule_id - -rule_ids = {} -token_types = {} - -def _get_token_type(token_type): - if token_type not in token_types: - token_types[token_type] = len(token_types) - return token_types[token_type] - -class ParserAtoms: - def __init__(self, parser): - self.parse_table = parser._parse_table - - def print_python(self): - print('class ParseTable: pass') - print('parse_table = ParseTable()') - print('STATES = {') - for state, actions in self.parse_table.states.items(): - print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg)) - for token, (action, arg) in actions.items()})) - print('}') - print('TOKEN_TYPES = (') - pprint({v:k for k, v in token_types.items()}) - print(')') - print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}') - print(' for s, acts in STATES.items()}') - print('parse_table.start_state = %s' % self.parse_table.start_state) - print('parse_table.end_state = %s' % self.parse_table.end_state) - print('class 
Lark_StandAlone:') - print(' def __init__(self, transformer=None, postlex=None):') - print(' callbacks = parse_tree_builder.create_callback(transformer=transformer)') - print(' self.parser = _Parser(parse_table, callbacks)') - print(' self.postlex = postlex') - print(' def parse(self, stream):') - print(' tokens = lex(stream)') - print(' sps = CON_LEXER.set_parser_state') - print(' if self.postlex: tokens = self.postlex.process(tokens)') - print(' return self.parser.parse(tokens, sps)') - -class TreeBuilderAtoms: - def __init__(self, lark): - self.rules = lark.rules - - def print_python(self): - # print('class InlineTransformer: pass') - print('RULES = {') - for i, r in enumerate(self.rules): - rule_ids[r] = i - print(' %d: Rule(%r, [%s], alias=%r, options=%r),' % (i, r.origin, ', '.join(s.fullrepr for s in r.expansion), r.alias, r.options )) - print('}') - print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') def main(fobj, start): lark_inst = Lark(fobj, parser="lalr", lexer="contextual", start=start) - lexer_atoms = ContextualLexerAtoms(lark_inst.parser.lexer) - parser_atoms = ParserAtoms(lark_inst.parser.parser) - tree_builder_atoms = TreeBuilderAtoms(lark_inst) - print('# The file was automatically generated by Lark v%s' % lark.__version__) for pyfile in EXTRACT_STANDALONE_FILES: with open(os.path.join(_larkdir, pyfile)) as f: print (extract_sections(f)['standalone']) - with open(os.path.join(_larkdir, 'grammar.py')) as grammar_py: - print(grammar_py.read()) + data, m = lark_inst.memo_serialize([TerminalDef]) + print( 'DATA = (' ) + # pprint(data, width=160) + print(data) + print(')') + print( 'MEMO = (') + print(m) + print(')') + print('Shift = 0') print('Reduce = 1') - lexer_atoms.print_python() - tree_builder_atoms.print_python() - parser_atoms.print_python() + print("def load_parser():") + print(" return Lark.deserialize(DATA)") + + + + if __name__ == '__main__': if len(sys.argv) < 2: diff --git a/lark/utils.py b/lark/utils.py index 3dda697..0849745 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,8 +1,6 @@ import sys from collections import deque -Py36 = (sys.version_info[:2] >= (3, 6)) - class fzset(frozenset): def __repr__(self): return '{%s}' % ', '.join(map(repr, self)) @@ -44,56 +42,90 @@ def bfs(initial, expand): -def _serialize(value): +###{standalone +import sys, re + +Py36 = (sys.version_info[:2] >= (3, 6)) + + + +def _serialize(value, memo): if isinstance(value, Serialize): - return value.serialize() + return value.serialize(memo) elif isinstance(value, list): - return [_serialize(elem) for elem in value] + return [_serialize(elem, memo) for elem in value] elif isinstance(value, frozenset): return list(value) # TODO reversible? 
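    # (the frozenset's type is dropped here: after a round-trip through
    #  _deserialize it comes back as a plain list, which is what the TODO refers to)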
elif isinstance(value, dict): - return {key:_serialize(elem) for key, elem in value.items()} + return {key:_serialize(elem, memo) for key, elem in value.items()} return value -def _deserialize(data, namespace): +def _deserialize(data, namespace, memo): if isinstance(data, dict): if '__type__' in data: # Object class_ = namespace[data['__type__']] - return class_.deserialize(data) - return {key:_deserialize(value, namespace) for key, value in data.items()} + return class_.deserialize(data, memo) + return {key:_deserialize(value, namespace, memo) for key, value in data.items()} elif isinstance(data, list): - return [_deserialize(value, namespace) for value in data] + return [_deserialize(value, namespace, memo) for value in data] return data class Serialize(object): - def serialize(self): + def memo_serialize(self, types_to_memoize): + memo = SerializeMemoizer(types_to_memoize) + return self.serialize(memo), memo.serialize() + + def serialize(self, memo=None): + if memo and memo.in_types(self): + return {'__memo__': memo.memoized.get(self)} + fields = getattr(self, '__serialize_fields__') - res = {f: _serialize(getattr(self, f)) for f in fields} + res = {f: _serialize(getattr(self, f), memo) for f in fields} res['__type__'] = type(self).__name__ postprocess = getattr(self, '_serialize', None) if postprocess: - postprocess(res) + postprocess(res, memo) return res @classmethod - def deserialize(cls, data): - namespace = getattr(cls, '__serialize_namespace__', dict) - namespace = {c.__name__:c for c in namespace()} + def deserialize(cls, data, memo): + namespace = getattr(cls, '__serialize_namespace__', {}) + namespace = {c.__name__:c for c in namespace} fields = getattr(cls, '__serialize_fields__') + if '__memo__' in data: + return memo[data['__memo__']] + inst = cls.__new__(cls) for f in fields: - setattr(inst, f, _deserialize(data[f], namespace)) + setattr(inst, f, _deserialize(data[f], namespace, memo)) postprocess = getattr(inst, '_deserialize', None) if postprocess: postprocess() return inst +class SerializeMemoizer(Serialize): + __serialize_fields__ = 'memoized', + + def __init__(self, types_to_memoize): + self.types_to_memoize = tuple(types_to_memoize) + self.memoized = Enumerator() + + def in_types(self, value): + return isinstance(value, self.types_to_memoize) + + def serialize(self): + return _serialize(self.memoized.reversed(), None) + + @classmethod + def deserialize(cls, data, namespace, memo): + return _deserialize(data, namespace, memo) + + -###{standalone try: STRING_TYPE = basestring except NameError: # Python 3 @@ -178,7 +210,7 @@ def get_regexp_width(regexp): raise ValueError(regexp) -class Enumerator: +class Enumerator(Serialize): def __init__(self): self.enums = {} From 3c64c56bcdcc76d81be065e7ef421bf94285fb98 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Apr 2019 11:59:26 +0300 Subject: [PATCH 09/14] All tests passing --- lark/parser_frontends.py | 2 +- lark/parsers/lalr_analysis.py | 9 ++++----- lark/parsers/lalr_parser.py | 4 ++-- lark/tools/standalone.py | 13 ++++++------- lark/utils.py | 19 ++++++++++++------- 5 files changed, 25 insertions(+), 22 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 6750480..e7f64a7 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -27,7 +27,7 @@ class WithLexer(Serialize): return inst def _serialize(self, data, memo): - data['parser'] = data['parser'].serialize() + data['parser'] = data['parser'].serialize(memo) def init_traditional_lexer(self, lexer_conf): 
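        # note that WithLexer only lists 'parser' and 'lexer' in __serialize_fields__,
        # so the lexer_conf stored here is not itself part of the serialized output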
self.lexer_conf = lexer_conf diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index e34b8c3..cceaa45 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -13,6 +13,7 @@ from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError from .grammar_analysis import GrammarAnalyzer, Terminal +from ..grammar import Rule ###{standalone @@ -33,19 +34,18 @@ class ParseTable: self.start_state = start_state self.end_state = end_state - def serialize(self): + def serialize(self, memo): tokens = Enumerator() rules = Enumerator() states = { - state: {tokens.get(token): ((1, rules.get(arg)) if action is Reduce else (0, arg)) + state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg)) for token, (action, arg) in actions.items()} for state, actions in self.states.items() } return { 'tokens': tokens.reversed(), - 'rules': {idx: r.serialize() for idx, r in rules.reversed().items()}, 'states': states, 'start_state': self.start_state, 'end_state': self.end_state, @@ -54,9 +54,8 @@ class ParseTable: @classmethod def deserialize(cls, data, memo): tokens = data['tokens'] - rules = data['rules'] states = { - state: {tokens[token]: ((Reduce, rules[arg]) if action==1 else (Shift, arg)) + state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg)) for token, (action, arg) in actions.items()} for state, actions in data['states'].items() } diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 241a47e..5510e3d 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -28,8 +28,8 @@ class LALR_Parser(object): inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks) return inst - def serialize(self): - return self._parse_table.serialize() + def serialize(self, memo): + return self._parse_table.serialize(memo) def parse(self, *args): return self.parser.parse(*args) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 99e1929..3452a83 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -49,7 +49,7 @@ from lark import Lark from lark.parsers.lalr_analysis import Reduce -from lark.grammar import RuleOptions +from lark.grammar import RuleOptions, Rule from lark.lexer import TerminalDef _dir = path.dirname(__file__) @@ -63,13 +63,13 @@ EXTRACT_STANDALONE_FILES = [ 'tree.py', 'visitors.py', 'indenter.py', + 'grammar.py', 'lexer.py', 'parse_tree_builder.py', 'parsers/lalr_parser.py', 'parsers/lalr_analysis.py', 'parser_frontends.py', 'lark.py', - 'grammar.py', ] def extract_sections(lines): @@ -101,7 +101,7 @@ def main(fobj, start): with open(os.path.join(_larkdir, pyfile)) as f: print (extract_sections(f)['standalone']) - data, m = lark_inst.memo_serialize([TerminalDef]) + data, m = lark_inst.memo_serialize([TerminalDef, Rule]) print( 'DATA = (' ) # pprint(data, width=160) print(data) @@ -113,10 +113,9 @@ def main(fobj, start): print('Shift = 0') print('Reduce = 1') - print("def load_parser():") - print(" return Lark.deserialize(DATA)") - - + print("def Lark_StandAlone():") + print(" memo = SerializeMemoizer.deserialize(MEMO, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})") + print(" return Lark.deserialize(DATA, memo)") diff --git a/lark/utils.py b/lark/utils.py index 0849745..374c293 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -42,14 +42,11 @@ def bfs(initial, expand): -###{standalone -import sys, re - -Py36 = (sys.version_info[:2] >= (3, 6)) - - def 
_serialize(value, memo): + # if memo and memo.in_types(value): + # return {'__memo__': memo.memoized.get(value)} + if isinstance(value, Serialize): return value.serialize(memo) elif isinstance(value, list): @@ -60,11 +57,14 @@ def _serialize(value, memo): return {key:_serialize(elem, memo) for key, elem in value.items()} return value +###{standalone def _deserialize(data, namespace, memo): if isinstance(data, dict): if '__type__' in data: # Object class_ = namespace[data['__type__']] return class_.deserialize(data, memo) + elif '__memo__' in data: + return memo[data['__memo__']] return {key:_deserialize(value, namespace, memo) for key, value in data.items()} elif isinstance(data, list): return [_deserialize(value, namespace, memo) for value in data] @@ -159,6 +159,11 @@ def smart_decorator(f, create_decorator): else: return create_decorator(f.__func__.__call__, True) +import sys, re +Py36 = (sys.version_info[:2] >= (3, 6)) +###} + + def dedup_list(l): """Given a list (l) will removing duplicates from the list, preserving the original order of the list. Assumes that @@ -166,7 +171,7 @@ def dedup_list(l): dedup = set() return [ x for x in l if not (x in dedup or dedup.add(x))] -###} + try: From 6efa6b4fa0faf9eef2705ea4d6f7ccdbd051d07a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Apr 2019 12:05:13 +0300 Subject: [PATCH 10/14] Still working --- lark/parser_frontends.py | 72 ++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e7f64a7..afa7de4 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -11,6 +11,40 @@ from .tree import Tree ###{standalone +def get_frontend(parser, lexer): + if parser=='lalr': + if lexer is None: + raise ValueError('The LALR parser requires use of a lexer') + elif lexer == 'standard': + return LALR_TraditionalLexer + elif lexer == 'contextual': + return LALR_ContextualLexer + elif issubclass(lexer, Lexer): + return partial(LALR_CustomLexer, lexer) + else: + raise ValueError('Unknown lexer: %s' % lexer) + elif parser=='earley': + if lexer=='standard': + return Earley + elif lexer=='dynamic': + return XEarley + elif lexer=='dynamic_complete': + return XEarley_CompleteLex + elif lexer=='contextual': + raise ValueError('The Earley parser does not support the contextual parser') + else: + raise ValueError('Unknown lexer: %s' % lexer) + elif parser == 'cyk': + if lexer == 'standard': + return CYK + else: + raise ValueError('CYK parser requires using standard parser.') + else: + raise ValueError('Unknown parser: %s' % parser) + + + + class WithLexer(Serialize): lexer = None parser = None @@ -66,6 +100,8 @@ class LALR_ContextualLexer(WithLexer): self.parser = LALR_Parser(parser_conf, debug=debug) self.init_contextual_lexer(lexer_conf) +###} + class LALR_CustomLexer(WithLexer): def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): self.parser = LALR_Parser(parser_conf) @@ -159,39 +195,3 @@ class CYK(WithLexer): def _apply_callback(self, tree): return self.callbacks[tree.rule](tree.children) - -def get_frontend(parser, lexer): - if parser=='lalr': - if lexer is None: - raise ValueError('The LALR parser requires use of a lexer') - elif lexer == 'standard': - return LALR_TraditionalLexer - elif lexer == 'contextual': - return LALR_ContextualLexer - elif issubclass(lexer, Lexer): - return partial(LALR_CustomLexer, lexer) - else: - raise ValueError('Unknown lexer: %s' % lexer) - elif parser=='earley': - if lexer=='standard': - 
return Earley - elif lexer=='dynamic': - return XEarley - elif lexer=='dynamic_complete': - return XEarley_CompleteLex - elif lexer=='contextual': - raise ValueError('The Earley parser does not support the contextual parser') - else: - raise ValueError('Unknown lexer: %s' % lexer) - elif parser == 'cyk': - if lexer == 'standard': - return CYK - else: - raise ValueError('CYK parser requires using standard parser.') - else: - raise ValueError('Unknown parser: %s' % parser) - - - - -###} \ No newline at end of file From ae51402cc74aa94f3e3be4275f2218a3d7a8391b Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Apr 2019 13:48:43 +0300 Subject: [PATCH 11/14] Added serializer test --- lark/lark.py | 6 ++++-- lark/parser_frontends.py | 2 +- lark/tools/standalone.py | 4 ++-- tests/test_parser.py | 19 +++++++++++++++++++ 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 1309c60..7cf5152 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -5,7 +5,7 @@ import time from collections import defaultdict from io import open -from .utils import STRING_TYPE, Serialize +from .utils import STRING_TYPE, Serialize, SerializeMemoizer from .load_grammar import load_grammar from .tree import Tree from .common import LexerConf, ParserConf @@ -241,7 +241,9 @@ class Lark(Serialize): return self.parser_class(self.lexer_conf, parser_conf, options=self.options) @classmethod - def deserialize(cls, data, memo): + def deserialize(cls, data, namespace, memo): + if memo: + memo = SerializeMemoizer.deserialize(memo, namespace, {}) inst = cls.__new__(cls) inst.options = LarkOptions.deserialize(data['options'], memo) inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index afa7de4..090f532 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -51,7 +51,7 @@ class WithLexer(Serialize): lexer_conf = None __serialize_fields__ = 'parser', 'lexer' - __serialize_namespace__ = Rule, ContextualLexer + __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer @classmethod def deserialize(cls, data, memo, callbacks): diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 3452a83..3a2118d 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -114,8 +114,8 @@ def main(fobj, start): print('Shift = 0') print('Reduce = 1') print("def Lark_StandAlone():") - print(" memo = SerializeMemoizer.deserialize(MEMO, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})") - print(" return Lark.deserialize(DATA, memo)") + print(" namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}") + print(" return Lark.deserialize(DATA, namespace, MEMO)") diff --git a/tests/test_parser.py b/tests/test_parser.py index 7cd7dd8..92cda02 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -21,6 +21,8 @@ from lark.lark import Lark from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree from lark.visitors import Transformer +from lark.grammar import Rule +from lark.lexer import TerminalDef __path__ = os.path.dirname(__file__) def _read(n, *args): @@ -1429,6 +1431,23 @@ def _make_parser_test(LEXER, PARSER): parser.parse(r'"That" "And a \"b"') + @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)") + def test_serialize(self): + grammar = """ + start: "A" b "C" + b: "B" + """ + parser = _Lark(grammar) + d = parser.serialize() + parser2 = 
Lark.deserialize(d, {}, {}) + self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]) ) + + namespace = {'Rule': Rule, 'TerminalDef': TerminalDef} + d, m = parser.memo_serialize(namespace.values()) + parser3 = Lark.deserialize(d, namespace, m) + self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) ) + + _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() _TestParser.__name__ = _NAME From 5ab12b031c7ec831a666a8a37326841ed7b9ca45 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Apr 2019 14:08:08 +0300 Subject: [PATCH 12/14] Added transformer, postlex arguments to standalone --- lark/lark.py | 9 ++++++--- lark/parser_frontends.py | 4 ++-- lark/tools/standalone.py | 4 ++-- tests/test_tools.py | 4 ++++ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 7cf5152..daeed31 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -241,14 +241,17 @@ class Lark(Serialize): return self.parser_class(self.lexer_conf, parser_conf, options=self.options) @classmethod - def deserialize(cls, data, namespace, memo): + def deserialize(cls, data, namespace, memo, transformer=None, postlex=None): if memo: memo = SerializeMemoizer.deserialize(memo, namespace, {}) inst = cls.__new__(cls) - inst.options = LarkOptions.deserialize(data['options'], memo) + options = dict(data['options']) + options['transformer'] = transformer + options['postlex'] = postlex + inst.options = LarkOptions.deserialize(options, memo) inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] inst._prepare_callbacks() - inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks) + inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) return inst diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 090f532..73c4611 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -54,9 +54,9 @@ class WithLexer(Serialize): __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer @classmethod - def deserialize(cls, data, memo, callbacks): + def deserialize(cls, data, memo, callbacks, postlex): inst = super(WithLexer, cls).deserialize(data, memo) - inst.postlex = None # TODO + inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) return inst diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 3a2118d..ab334b5 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -113,9 +113,9 @@ def main(fobj, start): print('Shift = 0') print('Reduce = 1') - print("def Lark_StandAlone():") + print("def Lark_StandAlone(transformer=None, postlex=None):") print(" namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}") - print(" return Lark.deserialize(DATA, namespace, MEMO)") + print(" return Lark.deserialize(DATA, namespace, MEMO, transformer=transformer, postlex=postlex)") diff --git a/tests/test_tools.py b/tests/test_tools.py index 5965788..e1c49c4 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -70,6 +70,10 @@ class TestStandalone(TestCase): x = T().transform(x) self.assertEqual(x, ['a', 'b']) + l2 = _Lark(transformer=T()) + x = l2.parse('ABAB') + self.assertEqual(x, ['a', 'b']) + if __name__ == '__main__': unittest.main() From 65bde7e15f4341949e6ec22cbf2aa30190baa948 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Apr 2019 14:16:37 +0300 Subject: [PATCH 13/14] @ instead of __memo__ --- lark/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/lark/utils.py b/lark/utils.py index 374c293..cf1042b 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -63,8 +63,8 @@ def _deserialize(data, namespace, memo): if '__type__' in data: # Object class_ = namespace[data['__type__']] return class_.deserialize(data, memo) - elif '__memo__' in data: - return memo[data['__memo__']] + elif '@' in data: + return memo[data['@']] return {key:_deserialize(value, namespace, memo) for key, value in data.items()} elif isinstance(data, list): return [_deserialize(value, namespace, memo) for value in data] @@ -78,7 +78,7 @@ class Serialize(object): def serialize(self, memo=None): if memo and memo.in_types(self): - return {'__memo__': memo.memoized.get(self)} + return {'@': memo.memoized.get(self)} fields = getattr(self, '__serialize_fields__') res = {f: _serialize(getattr(self, f), memo) for f in fields} @@ -95,8 +95,8 @@ class Serialize(object): fields = getattr(cls, '__serialize_fields__') - if '__memo__' in data: - return memo[data['__memo__']] + if '@' in data: + return memo[data['@']] inst = cls.__new__(cls) for f in fields: From d48e037ca736cca471e313b9676a7415aaa4f207 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Apr 2019 14:25:44 +0300 Subject: [PATCH 14/14] Updated standalone example --- examples/standalone/create_standalone.sh | 2 +- examples/standalone/json_parser.py | 1827 +++++++++++++++++----- 2 files changed, 1444 insertions(+), 385 deletions(-) diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh index a4fa879..141ab89 100755 --- a/examples/standalone/create_standalone.sh +++ b/examples/standalone/create_standalone.sh @@ -1 +1 @@ -python -m lark.tools.standalone json.lark > json_parser.py +PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index 8c51baf..d424f1b 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,4 @@ -# The file was automatically generated by Lark v0.5.5 +# The file was automatically generated by Lark v0.7.0 # # # Lark Stand-alone Generator Tool @@ -18,6 +18,9 @@ # If you wish to purchase a commercial license for this tool and its # generated code, contact me via email. # +# If GPL is incompatible with your free or open-source project, +# contact me and we'll work it out (for free). 
+# # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or @@ -32,91 +35,218 @@ # # +class LarkError(Exception): + pass -import types -import functools -from contextlib import contextmanager +class GrammarError(LarkError): + pass -Str = type(u'') +class ParseError(LarkError): + pass -def inline_args(f): - # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) - if isinstance(f, types.FunctionType): - @functools.wraps(f) - def _f_func(self, args): - return f(self, *args) - return _f_func - elif isinstance(f, (type, types.BuiltinFunctionType)): - @functools.wraps(f) - def _f_builtin(_self, args): - return f(*args) - return _f_builtin - elif isinstance(f, types.MethodType): - @functools.wraps(f.__func__) - def _f(self, args): - return f.__func__(self, *args) - return _f - else: - @functools.wraps(f.__call__.__func__) - def _f(self, args): - return f.__call__.__func__(self, *args) - return _f +class LexError(LarkError): + pass +class UnexpectedInput(LarkError): + pos_in_stream = None + + def get_context(self, text, span=40): + pos = self.pos_in_stream + start = max(pos - span, 0) + end = pos + span + before = text[start:pos].rsplit('\n', 1)[-1] + after = text[pos:end].split('\n', 1)[0] + return before + after + '\n' + ' ' * len(before) + '^\n' + + def match_examples(self, parse_fn, examples): + """ Given a parser instance and a dictionary mapping some label with + some malformed syntax examples, it'll return the label for the + example that bests matches the current error. + """ + assert self.state is not None, "Not supported for this exception" -try: - from contextlib import suppress # Python 3 -except ImportError: - @contextmanager - def suppress(*excs): - '''Catch and dismiss the provided exception - - >>> x = 'hello' - >>> with suppress(IndexError): - ... 
x = x[10] - >>> x - 'hello' - ''' - try: - yield - except excs: - pass + candidate = None + for label, example in examples.items(): + assert not isinstance(example, STRING_TYPE) + for malformed in example: + try: + parse_fn(malformed) + except UnexpectedInput as ut: + if ut.state == self.state: + try: + if ut.token == self.token: # Try exact match first + return label + except AttributeError: + pass + if not candidate: + candidate = label -def is_terminal(sym): - return sym.isupper() + return candidate -class GrammarError(Exception): - pass -class ParseError(Exception): - pass +class UnexpectedCharacters(LexError, UnexpectedInput): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None): + message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) + + self.line = line + self.column = column + self.allowed = allowed + self.considered_tokens = considered_tokens + self.pos_in_stream = lex_pos + self.state = state -class UnexpectedToken(ParseError): - def __init__(self, token, expected, seq, index, considered_rules=None): + message += '\n\n' + self.get_context(seq) + if allowed: + message += '\nExpecting: %s\n' % allowed + + super(UnexpectedCharacters, self).__init__(message) + + + +class UnexpectedToken(ParseError, UnexpectedInput): + def __init__(self, token, expected, considered_rules=None, state=None): self.token = token - self.expected = expected + self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') self.considered_rules = considered_rules + self.state = state + self.pos_in_stream = getattr(token, 'pos_in_stream', None) - try: - context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) - except AttributeError: - context = seq[index:index+5] - except TypeError: - context = "" message = ("Unexpected token %r at line %s, column %s.\n" - "Expected: %s\n" - "Context: %s" % (token, self.line, self.column, expected, context)) + "Expected one of: \n\t* %s\n" + % (token, self.line, self.column, '\n\t* '.join(self.expected))) super(UnexpectedToken, self).__init__(message) +class VisitError(LarkError): + def __init__(self, tree, orig_exc): + self.tree = tree + self.orig_exc = orig_exc + + message = 'Error trying to process rule "%s":\n\n%s' % (tree.data, orig_exc) + super(VisitError, self).__init__(message) + +def _deserialize(data, namespace, memo): + if isinstance(data, dict): + if '__type__' in data: # Object + class_ = namespace[data['__type__']] + return class_.deserialize(data, memo) + elif '@' in data: + return memo[data['@']] + return {key:_deserialize(value, namespace, memo) for key, value in data.items()} + elif isinstance(data, list): + return [_deserialize(value, namespace, memo) for value in data] + return data + + +class Serialize(object): + def memo_serialize(self, types_to_memoize): + memo = SerializeMemoizer(types_to_memoize) + return self.serialize(memo), memo.serialize() + + def serialize(self, memo=None): + if memo and memo.in_types(self): + return {'@': memo.memoized.get(self)} + + fields = getattr(self, '__serialize_fields__') + res = {f: _serialize(getattr(self, f), memo) for f in fields} + res['__type__'] = type(self).__name__ + postprocess = getattr(self, '_serialize', None) + if postprocess: + postprocess(res, memo) + return res + + @classmethod + def deserialize(cls, data, memo): + namespace = getattr(cls, '__serialize_namespace__', {}) + namespace = {c.__name__:c for c in namespace} + + fields 
= getattr(cls, '__serialize_fields__') + + if '@' in data: + return memo[data['@']] + + inst = cls.__new__(cls) + for f in fields: + setattr(inst, f, _deserialize(data[f], namespace, memo)) + postprocess = getattr(inst, '_deserialize', None) + if postprocess: + postprocess() + return inst + + +class SerializeMemoizer(Serialize): + __serialize_fields__ = 'memoized', + + def __init__(self, types_to_memoize): + self.types_to_memoize = tuple(types_to_memoize) + self.memoized = Enumerator() + + def in_types(self, value): + return isinstance(value, self.types_to_memoize) + + def serialize(self): + return _serialize(self.memoized.reversed(), None) + + @classmethod + def deserialize(cls, data, namespace, memo): + return _deserialize(data, namespace, memo) + + + +try: + STRING_TYPE = basestring +except NameError: # Python 3 + STRING_TYPE = str + + +import types +from functools import wraps, partial +from contextlib import contextmanager + +Str = type(u'') +try: + classtype = types.ClassType # Python2 +except AttributeError: + classtype = type # Python3 + +def smart_decorator(f, create_decorator): + if isinstance(f, types.FunctionType): + return wraps(f)(create_decorator(f, True)) + + elif isinstance(f, (classtype, type, types.BuiltinFunctionType)): + return wraps(f)(create_decorator(f, False)) + + elif isinstance(f, types.MethodType): + return wraps(f)(create_decorator(f.__func__, True)) + + elif isinstance(f, partial): + # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 + return create_decorator(f.__func__, True) + + else: + return create_decorator(f.__func__.__call__, True) + +import sys, re +Py36 = (sys.version_info[:2] >= (3, 6)) +class Meta: + def __init__(self): + self.empty = True class Tree(object): - def __init__(self, data, children): + def __init__(self, data, children, meta=None): self.data = data self.children = children + self._meta = meta + + @property + def meta(self): + if self._meta is None: + self._meta = Meta() + return self._meta def __repr__(self): return 'Tree(%s, %s)' % (self.data, self.children) @@ -139,33 +269,111 @@ class Tree(object): def pretty(self, indent_str=' '): return ''.join(self._pretty(0, indent_str)) -class Transformer(object): - def _get_func(self, name): - return getattr(self, name) - def transform(self, tree): - items = [] - for c in tree.children: - try: - items.append(self.transform(c) if isinstance(c, Tree) else c) - except Discard: - pass + def __eq__(self, other): + try: + return self.data == other.data and self.children == other.children + except AttributeError: + return False + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return hash((self.data, tuple(self.children))) + +from inspect import getmembers, getmro + +class Discard(Exception): + pass + +# Transformers + +class Transformer: + """Visits the tree recursively, starting with the leaves and finally the root (bottom-up) + + Calls its methods (provided by user via inheritance) according to tree.data + The returned value replaces the old one in the structure. + + Can be used to implement map or reduce. 
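+
+    For example (illustrative): a method such as
+        def number(self, children): return int(children[0])
+    replaces every 'number' subtree with the integer value of its first child.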
+ """ + + def _call_userfunc(self, tree, new_children=None): + # Assumes tree is already transformed + children = new_children if new_children is not None else tree.children try: - f = self._get_func(tree.data) + f = getattr(self, tree.data) except AttributeError: - return self.__default__(tree.data, items) + return self.__default__(tree.data, children, tree.meta) else: - return f(items) + try: + if getattr(f, 'meta', False): + return f(children, tree.meta) + elif getattr(f, 'inline', False): + return f(*children) + elif getattr(f, 'whole_tree', False): + if new_children is not None: + raise NotImplementedError("Doesn't work with the base Transformer class") + return f(tree) + else: + return f(children) + except (GrammarError, Discard): + raise + except Exception as e: + raise VisitError(tree, e) + + def _transform_children(self, children): + for c in children: + try: + yield self._transform_tree(c) if isinstance(c, Tree) else c + except Discard: + pass - def __default__(self, data, children): - return Tree(data, children) + def _transform_tree(self, tree): + children = list(self._transform_children(tree.children)) + return self._call_userfunc(tree, children) + + def transform(self, tree): + return self._transform_tree(tree) def __mul__(self, other): return TransformerChain(self, other) + def __default__(self, data, children, meta): + "Default operation on tree (for override)" + return Tree(data, children, meta) + + @classmethod + def _apply_decorator(cls, decorator, **kwargs): + mro = getmro(cls) + assert mro[0] is cls + libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} + for name, value in getmembers(cls): + if name.startswith('_') or name in libmembers: + continue + if not callable(cls.__dict__[name]): + continue + + # Skip if v_args already applied (at the function level) + if hasattr(cls.__dict__[name], 'vargs_applied'): + continue + + static = isinstance(cls.__dict__[name], (staticmethod, classmethod)) + setattr(cls, name, decorator(value, static=static, **kwargs)) + return cls + + +class InlineTransformer(Transformer): # XXX Deprecated + def _call_userfunc(self, tree, new_children=None): + # Assumes tree is already transformed + children = new_children if new_children is not None else tree.children + try: + f = getattr(self, tree.data) + except AttributeError: + return self.__default__(tree.data, children, tree.meta) + else: + return f(*children) -class Discard(Exception): - pass class TransformerChain(object): def __init__(self, *transformers): @@ -180,13 +388,57 @@ class TransformerChain(object): return TransformerChain(*self.transformers + (other,)) +class Transformer_InPlace(Transformer): + "Non-recursive. Changes the tree in-place instead of returning new instances" + def _transform_tree(self, tree): # Cancel recursion + return self._call_userfunc(tree) + + def transform(self, tree): + for subtree in tree.iter_subtrees(): + subtree.children = list(self._transform_children(subtree.children)) + + return self._transform_tree(tree) + + +class Transformer_InPlaceRecursive(Transformer): + "Recursive. 
Changes the tree in-place instead of returning new instances" + def _transform_tree(self, tree): + tree.children = list(self._transform_children(tree.children)) + return self._call_userfunc(tree) + + + +# Visitors + +class VisitorBase: + def _call_userfunc(self, tree): + return getattr(self, tree.data, self.__default__)(tree) + + def __default__(self, tree): + "Default operation on tree (for override)" + return tree + + +class Visitor(VisitorBase): + """Bottom-up visitor, non-recursive + + Visits the tree, starting with the leaves and finally the root (bottom-up) + Calls its methods (provided by user via inheritance) according to tree.data + """ + + + def visit(self, tree): + for subtree in tree.iter_subtrees(): + self._call_userfunc(subtree) + return tree -class InlineTransformer(Transformer): - def _get_func(self, name): # use super()._get_func - return inline_args(getattr(self, name)).__get__(self) +class Visitor_Recursive(VisitorBase): + """Bottom-up visitor, recursive + Visits the tree, starting with the leaves and finally the root (bottom-up) + Calls its methods (provided by user via inheritance) according to tree.data + """ -class Visitor(object): def visit(self, tree): for child in tree.children: if isinstance(child, Tree): @@ -196,50 +448,109 @@ class Visitor(object): f(tree) return tree - def __default__(self, tree): - pass -class Visitor_NoRecurse(Visitor): +def visit_children_decor(func): + "See Interpreter" + @wraps(func) + def inner(cls, tree): + values = cls.visit_children(tree) + return func(cls, values) + return inner + + +class Interpreter: + """Top-down visitor, recursive + + Visits the tree, starting with the root and finally the leaves (top-down) + Calls its methods (provided by user via inheritance) according to tree.data + + Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. 
+ The user has to explicitly call visit_children, or use the @visit_children_decor + """ def visit(self, tree): - subtrees = list(tree.iter_subtrees()) + return getattr(self, tree.data)(tree) - for subtree in (subtrees): - getattr(self, subtree.data, self.__default__)(subtree) - return tree + def visit_children(self, tree): + return [self.visit(child) if isinstance(child, Tree) else child + for child in tree.children] + def __getattr__(self, name): + return self.__default__ -class Transformer_NoRecurse(Transformer): - def transform(self, tree): - subtrees = list(tree.iter_subtrees()) + def __default__(self, tree): + return self.visit_children(tree) - def _t(t): - # Assumes t is already transformed - try: - f = self._get_func(t.data) - except AttributeError: - return self.__default__(t) - else: - return f(t) - for subtree in subtrees: - children = [] - for c in subtree.children: - try: - children.append(_t(c) if isinstance(c, Tree) else c) - except Discard: - pass - subtree.children = children - return _t(tree) - def __default__(self, t): - return t +# Decorators + +def _apply_decorator(obj, decorator, **kwargs): + try: + _apply = obj._apply_decorator + except AttributeError: + return decorator(obj, **kwargs) + else: + return _apply(decorator, **kwargs) + + + +def _inline_args__func(func): + @wraps(func) + def create_decorator(_f, with_self): + if with_self: + def f(self, children): + return _f(self, *children) + else: + def f(self, children): + return _f(*children) + return f + + return smart_decorator(func, create_decorator) + + +def inline_args(obj): # XXX Deprecated + return _apply_decorator(obj, _inline_args__func) + + + +def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, static=False): + assert [whole_tree, meta, inline].count(True) <= 1 + def create_decorator(_f, with_self): + if with_self: + def f(self, *args, **kwargs): + return _f(self, *args, **kwargs) + else: + def f(self, *args, **kwargs): + return _f(*args, **kwargs) + return f + + if static: + f = wraps(func)(create_decorator(func, False)) + else: + f = smart_decorator(func, create_decorator) + f.vargs_applied = True + f.inline = inline + f.meta = meta + f.whole_tree = whole_tree + return f + +def v_args(inline=False, meta=False, tree=False): + "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods" + if [tree, meta, inline].count(True) > 1: + raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. These cannot be combined.") + def _visitor_args_dec(obj): + return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree) + return _visitor_args_dec + + class Indenter: def __init__(self): - self.paren_level = 0 - self.indent_level = [0] + self.paren_level = None + self.indent_level = None + assert self.tab_len > 0 def handle_NL(self, token): if self.paren_level > 0: @@ -260,7 +571,7 @@ class Indenter: assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) - def process(self, stream): + def _process(self, stream): for token in stream: if token.type == self.NL_type: for t in self.handle_NL(token): @@ -280,43 +591,213 @@ class Indenter: assert self.indent_level == [0], self.indent_level + def process(self, stream): + self.paren_level = 0 + self.indent_level = [0] + return self._process(stream) + # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? 
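+    # (the contextual lexer only accepts the terminals expected by the current parser
+    #  state; ContextualLexer adds always_accept to every state, which is how NL tokens
+    #  keep reaching this post-lexer no matter which state the parser is in)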
@property def always_accept(self): return (self.NL_type,) -class LexError(Exception): - pass -class UnexpectedInput(LexError): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_rules=None): - context = seq[lex_pos:lex_pos+5] - message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column) - if allowed: - message += '\n\nExpecting: %s\n' % allowed +class Symbol(Serialize): + is_term = NotImplemented + + def __init__(self, name): + self.name = name + + def __eq__(self, other): + assert isinstance(other, Symbol), other + return self.is_term == other.is_term and self.name == other.name + + def __ne__(self, other): + return not (self == other) + + def __hash__(self): + return hash(self.name) + + def __repr__(self): + return '%s(%r)' % (type(self).__name__, self.name) + + fullrepr = property(__repr__) + + +class Terminal(Symbol): + __serialize_fields__ = 'name', 'filter_out' + + is_term = True + + def __init__(self, name, filter_out=False): + self.name = name + self.filter_out = filter_out + + @property + def fullrepr(self): + return '%s(%r, %r)' % (type(self).__name__, self.name, self.filter_out) + + + +class NonTerminal(Symbol): + __serialize_fields__ = 'name', + + is_term = False + + + +class RuleOptions(Serialize): + __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices' + + def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.priority = priority + self.empty_indices = empty_indices + + def __repr__(self): + return 'RuleOptions(%r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.priority, + ) + + +class Rule(Serialize): + """ + origin : a symbol + expansion : a list of symbols + order : index of this expansion amongst all rules of the same name + """ + __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') + + __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' + __serialize_namespace__ = Terminal, NonTerminal, RuleOptions + + def __init__(self, origin, expansion, order=0, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.order = order + self.options = options + self._hash = hash((self.origin, tuple(self.expansion))) + + def _deserialize(self): + self._hash = hash((self.origin, tuple(self.expansion))) + + def __str__(self): + return '<%s : %s>' % (self.origin.name, ' '.join(x.name for x in self.expansion)) + + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + def __hash__(self): + return self._hash + + def __eq__(self, other): + if not isinstance(other, Rule): + return False + return self.origin == other.origin and self.expansion == other.expansion + + + + + +class Pattern(Serialize): + __serialize_fields__ = 'value', 'flags' + + def __init__(self, value, flags=()): + self.value = value + self.flags = frozenset(flags) + + def __repr__(self): + return repr(self.to_regexp()) + + # Pattern Hashing assumes all subclasses have a different priority! 
+ def __hash__(self): + return hash((type(self), self.value, self.flags)) + def __eq__(self, other): + return type(self) == type(other) and self.value == other.value and self.flags == other.flags + + def to_regexp(self): + raise NotImplementedError() + + if Py36: + # Python 3.6 changed syntax for flags in regular expression + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s:%s)' % (f, value)) + return value + + else: + def _get_flags(self, value): + for f in self.flags: + value = ('(?%s)' % f) + value + return value + + +class PatternStr(Pattern): + def to_regexp(self): + return self._get_flags(re.escape(self.value)) + + @property + def min_width(self): + return len(self.value) + max_width = min_width + +class PatternRE(Pattern): + def to_regexp(self): + return self._get_flags(self.value) + + @property + def min_width(self): + return get_regexp_width(self.to_regexp())[0] + @property + def max_width(self): + return get_regexp_width(self.to_regexp())[1] + + +class TerminalDef(Serialize): + __serialize_fields__ = 'name', 'pattern', 'priority' + __serialize_namespace__ = PatternStr, PatternRE + + def __init__(self, name, pattern, priority=1): + assert isinstance(pattern, Pattern), pattern + self.name = name + self.pattern = pattern + self.priority = priority + + def __repr__(self): + return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - super(UnexpectedInput, self).__init__(message) - self.line = line - self.column = column - self.context = context - self.allowed = allowed - self.considered_rules = considered_rules class Token(Str): - def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): - inst = Str.__new__(cls, value) - inst.type = type_ - inst.pos_in_stream = pos_in_stream - inst.value = value - inst.line = line - inst.column = column - return inst + __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column') + + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None): + try: + self = super(Token, cls).__new__(cls, value) + except UnicodeDecodeError: + value = value.decode('latin1') + self = super(Token, cls).__new__(cls, value) + + self.type = type_ + self.pos_in_stream = pos_in_stream + self.value = value + self.line = line + self.column = column + self.end_line = end_line + self.end_column = end_column + return self @classmethod def new_borrow_pos(cls, type_, value, borrow_t): - return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) + return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column) + + def __reduce__(self): + return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) def __repr__(self): return 'Token(%s, %r)' % (self.type, self.value) @@ -338,7 +819,7 @@ class LineCounter: self.newline_char = '\n' self.char_pos = 0 self.line = 1 - self.column = 0 + self.column = 1 self.line_start_pos = 0 def feed(self, token, test_newline=True): @@ -353,45 +834,51 @@ class LineCounter: self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos + self.column = self.char_pos - self.line_start_pos + 1 class _Lex: "Built to serve both Lexer and ContextualLexer" - def __init__(self, lexer): + def __init__(self, lexer, state=None): self.lexer = lexer + self.state = state def lex(self, stream, newline_types, ignore_types): - 
newline_types = list(newline_types) - ignore_types = list(ignore_types) + newline_types = frozenset(newline_types) + ignore_types = frozenset(ignore_types) line_ctr = LineCounter() - t = None - while True: + while line_ctr.char_pos < len(stream): lexer = self.lexer for mre, type_from_index in lexer.mres: m = mre.match(stream, line_ctr.char_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: + if not m: + continue + + t = None + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + if not isinstance(t, Token): + raise ValueError("Callbacks must return a token (returned %r)" % t) + yield t + else: + if type_ in lexer.callback: t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - yield t - else: - if type_ in lexer.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t) - - line_ctr.feed(value, type_ in newline_types) - if t: - t.end_line = line_ctr.line - t.end_column = line_ctr.column - break - else: - if line_ctr.char_pos < len(stream): - raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + lexer.callback[type_](t) + + line_ctr.feed(value, type_ in newline_types) + if t: + t.end_line = line_ctr.line + t.end_column = line_ctr.column + break + else: + allowed = [v for m, tfi in lexer.mres for v in tfi.values()] + raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state) + class UnlessCallback: def __init__(self, mres): @@ -401,14 +888,183 @@ class UnlessCallback: for mre, type_from_index in self.mres: m = mre.match(t.value) if m: - value = m.group(0) t.type = type_from_index[m.lastindex] break return t +class CallChain: + def __init__(self, callback1, callback2, cond): + self.callback1 = callback1 + self.callback2 = callback2 + self.cond = cond + + def __call__(self, t): + t2 = self.callback1(t) + return self.callback2(t) if self.cond(t2) else t2 + + + + + +def _create_unless(terminals): + tokens_by_type = classify(terminals, lambda t: type(t.pattern)) + assert len(tokens_by_type) <= 2, tokens_by_type.keys() + embedded_strs = set() + callback = {} + for retok in tokens_by_type.get(PatternRE, []): + unless = [] # {} + for strtok in tokens_by_type.get(PatternStr, []): + if strtok.priority > retok.priority: + continue + s = strtok.pattern.value + m = re.match(retok.pattern.to_regexp(), s) + if m and m.group(0) == s: + unless.append(strtok) + if strtok.pattern.flags <= retok.pattern.flags: + embedded_strs.add(strtok) + if unless: + callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) + + terminals = [t for t in terminals if t not in embedded_strs] + return terminals, callback + + +def _build_mres(terminals, max_size, match_whole): + # Python sets an unreasonable group limit (currently 100) in its re module + # Worse, the only way to know we reached it is by catching an AssertionError! + # This function recursively tries less and less groups until it's successful. 
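+    # (match_whole is used for the 'unless' matchers built in _create_unless: the trailing
+    #  '$' makes UnlessCallback's mre.match() accept a string terminal only when it covers
+    #  the candidate token value in full)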
+ postfix = '$' if match_whole else '' + mres = [] + while terminals: + try: + mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size])) + except AssertionError: # Yes, this is what Python provides us.. :/ + return _build_mres(terminals, max_size//2, match_whole) + + # terms_from_name = {t.name: t for t in terminals[:max_size]} + mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) + terminals = terminals[max_size:] + return mres + +def build_mres(terminals, match_whole=False): + return _build_mres(terminals, len(terminals), match_whole) + +def _regexp_has_newline(r): + """Expressions that may indicate newlines in a regexp: + - newlines (\n) + - escaped newline (\\n) + - anything but ([^...]) + - any-char (.) when the flag (?s) exists + """ + return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r) + +class Lexer(Serialize): + """Lexer interface + + Method Signatures: + lex(self, stream) -> Iterator[Token] + + set_parser_state(self, state) # Optional + """ + set_parser_state = NotImplemented + lex = NotImplemented + + +class TraditionalLexer(Lexer): + __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types' + __serialize_namespace__ = TerminalDef, + + def _deserialize(self): + self.mres = build_mres(self.terminals) + self.callback = {} # TODO implement + + + def __init__(self, terminals, ignore=(), user_callbacks={}): + assert all(isinstance(t, TerminalDef) for t in terminals), terminals + + terminals = list(terminals) + + # Sanitization + for t in terminals: + try: + re.compile(t.pattern.to_regexp()) + except: + raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) + + if t.pattern.min_width == 0: + raise LexError("Lexer does not allow zero-width terminals. 
(%s: %s)" % (t.name, t.pattern)) + + assert set(ignore) <= {t.name for t in terminals} + + # Init + self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] + self.ignore_types = list(ignore) + + terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) + + terminals, self.callback = _create_unless(terminals) + assert all(self.callback.values()) + + for type_, f in user_callbacks.items(): + if type_ in self.callback: + # Already a callback there, probably UnlessCallback + self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_) + else: + self.callback[type_] = f + + self.terminals = terminals + + self.mres = build_mres(terminals) + + + def lex(self, stream): + return _Lex(self).lex(stream, self.newline_types, self.ignore_types) + + + +class ContextualLexer(Lexer): + __serialize_fields__ = 'root_lexer', 'lexers' + __serialize_namespace__ = TraditionalLexer, + + def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): + tokens_by_name = {} + for t in terminals: + assert t.name not in tokens_by_name, t + tokens_by_name[t.name] = t + + lexer_by_tokens = {} + self.lexers = {} + for state, accepts in states.items(): + key = frozenset(accepts) + try: + lexer = lexer_by_tokens[key] + except KeyError: + accepts = set(accepts) | set(ignore) | set(always_accept) + state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] + lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) + lexer_by_tokens[key] = lexer + + self.lexers[state] = lexer + + self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) + + self.set_parser_state(None) # Needs to be set on the outside + + def set_parser_state(self, state): + self.parser_state = state + + def lex(self, stream): + l = _Lex(self.lexers[self.parser_state], self.parser_state) + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + l.lexer = self.lexers[self.parser_state] + l.state = self.parser_state + + +from functools import partial, wraps +from itertools import repeat, product -from functools import partial class ExpandSingleChild: def __init__(self, node_builder): @@ -420,18 +1076,6 @@ class ExpandSingleChild: else: return self.node_builder(children) - -class CreateToken: - "Used for fixing the results of scanless parsing" - - def __init__(self, token_name, node_builder): - self.node_builder = node_builder - self.token_name = token_name - - def __call__(self, children): - return self.node_builder( [Token(self.token_name, ''.join(children))] ) - - class PropagatePositions: def __init__(self, node_builder): self.node_builder = node_builder @@ -439,23 +1083,83 @@ class PropagatePositions: def __call__(self, children): res = self.node_builder(children) - if children: - for a in children: - with suppress(AttributeError): - res.line = a.line - res.column = a.column - break + if isinstance(res, Tree): + for c in children: + if isinstance(c, Tree) and c.children and not c.meta.empty: + res.meta.line = c.meta.line + res.meta.column = c.meta.column + res.meta.start_pos = c.meta.start_pos + res.meta.empty = False + break + elif isinstance(c, Token): + res.meta.line = c.line + res.meta.column = c.column + res.meta.start_pos = c.pos_in_stream + res.meta.empty = False + break - for a in reversed(children): - with suppress(AttributeError): - res.end_line = a.end_line - res.end_column = a.end_column - break + for 
c in reversed(children): + if isinstance(c, Tree) and c.children and not c.meta.empty: + res.meta.end_line = c.meta.end_line + res.meta.end_column = c.meta.end_column + res.meta.end_pos = c.meta.end_pos + res.meta.empty = False + break + elif isinstance(c, Token): + res.meta.end_line = c.end_line + res.meta.end_column = c.end_column + res.meta.end_pos = c.pos_in_stream + len(c.value) + res.meta.empty = False + break return res class ChildFilter: + def __init__(self, to_include, append_none, node_builder): + self.node_builder = node_builder + self.to_include = to_include + self.append_none = append_none + + def __call__(self, children): + filtered = [] + + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none + if to_expand: + filtered += children[i].children + else: + filtered.append(children[i]) + + if self.append_none: + filtered += [None] * self.append_none + + return self.node_builder(filtered) + +class ChildFilterLALR(ChildFilter): + "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" + + def __call__(self, children): + filtered = [] + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none + if to_expand: + if filtered: + filtered += children[i].children + else: # Optimize for left-recursion + filtered = children[i].children + else: + filtered.append(children[i]) + + if self.append_none: + filtered += [None] * self.append_none + + return self.node_builder(filtered) + +class ChildFilterLALR_NoPlaceholders(ChildFilter): + "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" def __init__(self, to_include, node_builder): self.node_builder = node_builder self.to_include = to_include @@ -470,77 +1174,158 @@ class ChildFilter: filtered = children[i].children else: filtered.append(children[i]) - return self.node_builder(filtered) def _should_expand(sym): - return not is_terminal(sym) and sym.startswith('_') + return not sym.is_term and sym.name.startswith('_') + +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices): + # Prepare empty_indices as: How many Nones to insert at each index? 
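+    # e.g. an expansion of length 3 with _empty_indices == [True, False, False, True, False]
+    # gives s == '10010' and empty_indices == [1, 0, 1, 0]: one None is inserted before the
+    # first expansion symbol, one before the third, and none are appended at the end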
+ if _empty_indices: + assert _empty_indices.count(False) == len(expansion) + s = ''.join(str(int(b)) for b in _empty_indices) + empty_indices = [len(ones) for ones in s.split('0')] + assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion)) + else: + empty_indices = [0] * (len(expansion)+1) -def maybe_create_child_filter(expansion, filter_out): - to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out] + to_include = [] + nones_to_add = 0 + for i, sym in enumerate(expansion): + nones_to_add += empty_indices[i] + if keep_all_tokens or not (sym.is_term and sym.filter_out): + to_include.append((i, _should_expand(sym), nones_to_add)) + nones_to_add = 0 - if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): - return partial(ChildFilter, to_include) + nones_to_add += empty_indices[len(expansion)] + if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include): + if _empty_indices or ambiguous: + return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add) + else: + # LALR without placeholders + return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include]) + +class AmbiguousExpander: + """Deal with the case where we're expanding children ('_rule') into a parent but the children + are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself + ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children + into the right parents in the right places, essentially shifting the ambiguiuty up the tree.""" + def __init__(self, to_expand, tree_class, node_builder): + self.node_builder = node_builder + self.tree_class = tree_class + self.to_expand = to_expand -class Callback(object): - pass + def __call__(self, children): + def _is_ambig_tree(child): + return hasattr(child, 'data') and child.data == '_ambig' + + #### When we're repeatedly expanding ambiguities we can end up with nested ambiguities. + # All children of an _ambig node should be a derivation of that ambig node, hence + # it is safe to assume that if we see an _ambig node nested within an ambig node + # it is safe to simply expand it into the parent _ambig node as an alternative derivation. 
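+        # The loop below collects the indices of children that are both ambiguous and
+        # marked for expansion; nested '_ambig' grandchildren are flattened into their
+        # enclosing '_ambig' node (see the note above).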
+ ambiguous = [] + for i, child in enumerate(children): + if _is_ambig_tree(child): + if i in self.to_expand: + ambiguous.append(i) + + to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)] + child.expand_kids_by_index(*to_expand) + + if not ambiguous: + return self.node_builder(children) + + expand = [ iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children) ] + return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))]) + +def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): + to_expand = [i for i, sym in enumerate(expansion) + if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))] + if to_expand: + return partial(AmbiguousExpander, to_expand, tree_class) + +def ptb_inline_args(func): + @wraps(func) + def f(children): + return func(*children) + return f class ParseTreeBuilder: - def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False): + def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): self.tree_class = tree_class self.propagate_positions = propagate_positions self.always_keep_all_tokens = keep_all_tokens + self.ambiguous = ambiguous + self.maybe_placeholders = maybe_placeholders self.rule_builders = list(self._init_builders(rules)) - self.user_aliases = {} - def _init_builders(self, rules): - filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out} - filter_out |= {sym for rule in rules for sym in rule.expansion if is_terminal(sym) and sym.startswith('_')} - assert all(x.startswith('_') for x in filter_out) - for rule in rules: options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand_single_child = options.expand1 if options else False - create_token = options.create_token if options else False - wrapper_chain = filter(None, [ - create_token and partial(CreateToken, create_token), + wrapper_chain = list(filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None), self.propagate_positions and PropagatePositions, - ]) + self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), + ])) yield rule, wrapper_chain def create_callback(self, transformer=None): - callback = Callback() + callbacks = {} for rule, wrapper_chain in self.rule_builders: - internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) - user_callback_name = rule.alias or rule.origin + user_callback_name = rule.alias or rule.origin.name try: - f = transformer._get_func(user_callback_name) + f = getattr(transformer, user_callback_name) + assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer" + # XXX InlineTransformer is deprecated! 
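+                # An 'inline' callback receives the matched children spread out as
+                # positional arguments: ptb_inline_args (defined above) wraps it so that
+                # f(children) calls the user function as func(*children).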
+ if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer): + f = ptb_inline_args(f) except AttributeError: f = partial(self.tree_class, user_callback_name) - self.user_aliases[rule] = rule.alias - rule.alias = internal_callback_name - for w in wrapper_chain: f = w(f) - if hasattr(callback, internal_callback_name): + if rule in callbacks: raise GrammarError("Rule '%s' already exists" % (rule,)) - setattr(callback, internal_callback_name, f) - return callback + callbacks[rule] = f + + return callbacks + +class LALR_Parser(object): + def __init__(self, parser_conf, debug=False): + assert all(r.options is None or r.options.priority is None + for r in parser_conf.rules), "LALR doesn't yet support prioritization" + analysis = LALR_Analyzer(parser_conf, debug=debug) + analysis.compute_lookahead() + callbacks = parser_conf.callbacks + + self._parse_table = analysis.parse_table + self.parser_conf = parser_conf + self.parser = _Parser(analysis.parse_table, callbacks) + + @classmethod + def deserialize(cls, data, memo, callbacks): + inst = cls.__new__(cls) + inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks) + return inst + + def serialize(self, memo): + return self._parse_table.serialize(memo) + + def parse(self, *args): + return self.parser.parse(*args) class _Parser: @@ -551,7 +1336,6 @@ class _Parser: self.callbacks = callbacks def parse(self, seq, set_state=None): - i = 0 token = None stream = iter(seq) states = self.states @@ -561,14 +1345,13 @@ class _Parser: if set_state: set_state(self.start_state) - def get_action(key): + def get_action(token): state = state_stack[-1] try: - return states[state][key] + return states[state][token.type] except KeyError: - expected = states[state].keys() - - raise UnexpectedToken(token, expected, seq, i) + expected = [s for s in states[state].keys() if s.isupper()] + raise UnexpectedToken(token, expected, state=state) def reduce(rule): size = len(rule.expansion) @@ -581,15 +1364,15 @@ class _Parser: value = self.callbacks[rule](s) - _action, new_state = get_action(rule.origin) + _action, new_state = states[state_stack[-1]][rule.origin.name] assert _action is Shift state_stack.append(new_state) value_stack.append(value) # Main LALR-parser loop - for i, token in enumerate(stream): + for token in stream: while True: - action, arg = get_action(token.type) + action, arg = get_action(token) assert arg != self.end_state if action is Shift: @@ -600,8 +1383,9 @@ class _Parser: else: reduce(arg) + token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: - _action, arg = get_action('$END') + _action, arg = get_action(token) if _action is Shift: assert arg == self.end_state val ,= value_stack @@ -611,169 +1395,444 @@ class _Parser: -class Rule(object): +class Action: + def __init__(self, name): + self.name = name + def __str__(self): + return self.name + def __repr__(self): + return str(self) + +Shift = Action('Shift') +Reduce = Action('Reduce') + +class ParseTable: + def __init__(self, states, start_state, end_state): + self.states = states + self.start_state = start_state + self.end_state = end_state + + def serialize(self, memo): + tokens = Enumerator() + rules = Enumerator() + + states = { + state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()} + for state, actions in self.states.items() + } + + return { + 'tokens': tokens.reversed(), + 'states': states, + 'start_state': self.start_state, + 'end_state': 
self.end_state, + } + + @classmethod + def deserialize(cls, data, memo): + tokens = data['tokens'] + states = { + state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg)) + for token, (action, arg) in actions.items()} + for state, actions in data['states'].items() + } + return cls(states, data['start_state'], data['end_state']) + + +class IntParseTable(ParseTable): + + @classmethod + def from_ParseTable(cls, parse_table): + enum = list(parse_table.states) + state_to_idx = {s:i for i,s in enumerate(enum)} + int_states = {} + + for s, la in parse_table.states.items(): + la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v + for k,v in la.items()} + int_states[ state_to_idx[s] ] = la + + + start_state = state_to_idx[parse_table.start_state] + end_state = state_to_idx[parse_table.end_state] + return cls(int_states, start_state, end_state) + + + +def get_frontend(parser, lexer): + if parser=='lalr': + if lexer is None: + raise ValueError('The LALR parser requires use of a lexer') + elif lexer == 'standard': + return LALR_TraditionalLexer + elif lexer == 'contextual': + return LALR_ContextualLexer + elif issubclass(lexer, Lexer): + return partial(LALR_CustomLexer, lexer) + else: + raise ValueError('Unknown lexer: %s' % lexer) + elif parser=='earley': + if lexer=='standard': + return Earley + elif lexer=='dynamic': + return XEarley + elif lexer=='dynamic_complete': + return XEarley_CompleteLex + elif lexer=='contextual': + raise ValueError('The Earley parser does not support the contextual parser') + else: + raise ValueError('Unknown lexer: %s' % lexer) + elif parser == 'cyk': + if lexer == 'standard': + return CYK + else: + raise ValueError('CYK parser requires using standard parser.') + else: + raise ValueError('Unknown parser: %s' % parser) + + + + +class WithLexer(Serialize): + lexer = None + parser = None + lexer_conf = None + + __serialize_fields__ = 'parser', 'lexer' + __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer + + @classmethod + def deserialize(cls, data, memo, callbacks, postlex): + inst = super(WithLexer, cls).deserialize(data, memo) + inst.postlex = postlex + inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + return inst + + def _serialize(self, data, memo): + data['parser'] = data['parser'].serialize(memo) + + def init_traditional_lexer(self, lexer_conf): + self.lexer_conf = lexer_conf + self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) + self.postlex = lexer_conf.postlex + + def init_contextual_lexer(self, lexer_conf): + self.lexer_conf = lexer_conf + self.postlex = lexer_conf.postlex + states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} + always_accept = self.postlex.always_accept if self.postlex else () + self.lexer = ContextualLexer(lexer_conf.tokens, states, + ignore=lexer_conf.ignore, + always_accept=always_accept, + user_callbacks=lexer_conf.callbacks) + + def lex(self, text): + stream = self.lexer.lex(text) + return self.postlex.process(stream) if self.postlex else stream + + def parse(self, text): + token_stream = self.lex(text) + sps = self.lexer.set_parser_state + return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else []) + + +class LALR_TraditionalLexer(WithLexer): + def __init__(self, lexer_conf, parser_conf, options=None): + debug = options.debug if options else False + self.parser = LALR_Parser(parser_conf, debug=debug) + self.init_traditional_lexer(lexer_conf) + +class 
LALR_ContextualLexer(WithLexer): + def __init__(self, lexer_conf, parser_conf, options=None): + debug = options.debug if options else False + self.parser = LALR_Parser(parser_conf, debug=debug) + self.init_contextual_lexer(lexer_conf) + + + +class LarkOptions(Serialize): + """Specifies the options for Lark + """ - origin : a symbol - expansion : a list of symbols + OPTIONS_DOC = """ + parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley") + Note: "lalr" requires a lexer + + lexer - Decides whether or not to use a lexer stage + "standard": Use a standard lexer + "contextual": Stronger lexer (only works with parser="lalr") + "dynamic": Flexible and powerful (only with parser="earley") + "dynamic_complete": Same as dynamic, but tries *every* variation + of tokenizing possible. (only with parser="earley") + "auto" (default): Choose for me based on grammar and parser + + ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" + "resolve": The parser will automatically choose the simplest derivation + (it chooses consistently: greedy for tokens, non-greedy for rules) + "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). + + transformer - Applies the transformer to every parse tree + debug - Affects verbosity (default: False) + keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) + cache_grammar - Cache the Lark grammar (Default: False) + postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. + start - The start symbol (Default: start) + profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) + priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) + propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. + lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. 
rule?), replace them with None """ - def __init__(self, origin, expansion, alias=None, options=None): - self.origin = origin - self.expansion = expansion - self.alias = alias - self.options = options + if __doc__: + __doc__ += OPTIONS_DOC + + _defaults = { + 'debug': False, + 'keep_all_tokens': False, + 'tree_class': None, + 'cache_grammar': False, + 'postlex': None, + 'parser': 'earley', + 'lexer': 'auto', + 'transformer': None, + 'start': 'start', + 'profile': False, + 'priority': 'auto', + 'ambiguity': 'auto', + 'propagate_positions': False, + 'lexer_callbacks': {}, + 'maybe_placeholders': False, + } + + def __init__(self, options_dict): + o = dict(options_dict) + + options = {} + for name, default in self._defaults.items(): + if name in o: + value = o.pop(name) + if isinstance(default, bool): + value = bool(value) + else: + value = default - def __str__(self): - return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + options[name] = value - def __repr__(self): - return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + self.__dict__['options'] = options + assert self.parser in ('earley', 'lalr', 'cyk', None) -class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): - self.keep_all_tokens = keep_all_tokens - self.expand1 = expand1 - self.create_token = create_token # used for scanless postprocessing - self.priority = priority + if self.parser == 'earley' and self.transformer: + raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.' + 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)') + + if o: + raise ValueError("Unknown options: %s" % o.keys()) + + def __getattr__(self, name): + return self.options[name] + def __setattr__(self, name, value): + assert name in self.options + self.options[name] = value + + def serialize(self, memo): + return self.options + + @classmethod + def deserialize(cls, data, memo): + return cls(data) + + +class Profiler: + def __init__(self): + self.total_time = defaultdict(float) + self.cur_section = '__init__' + self.last_enter_time = time.time() + + def enter_section(self, name): + cur_time = time.time() + self.total_time[self.cur_section] += cur_time - self.last_enter_time + self.last_enter_time = cur_time + self.cur_section = name + + def make_wrapper(self, name, f): + def wrapper(*args, **kwargs): + last_section = self.cur_section + self.enter_section(name) + try: + return f(*args, **kwargs) + finally: + self.enter_section(last_section) + + return wrapper + + +class Lark(Serialize): + def __init__(self, grammar, **options): + """ + grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) + options : a dictionary controlling various aspects of Lark. 
+ """ + self.options = LarkOptions(options) + + # Some, but not all file-like objects have a 'name' attribute + try: + self.source = grammar.name + except AttributeError: + self.source = '' + + # Drain file-like objects to get their contents + try: + read = grammar.read + except AttributeError: + pass + else: + grammar = read() + + assert isinstance(grammar, STRING_TYPE) - self.filter_out = filter_out # remove this rule from the tree - # used for "token"-rules in scanless + if self.options.cache_grammar: + raise NotImplementedError("Not available yet") + + assert not self.options.profile, "Feature temporarily disabled" + # self.profiler = Profiler() if self.options.profile else None + + if self.options.lexer == 'auto': + if self.options.parser == 'lalr': + self.options.lexer = 'contextual' + elif self.options.parser == 'earley': + self.options.lexer = 'dynamic' + elif self.options.parser == 'cyk': + self.options.lexer = 'standard' + else: + assert False, self.options.parser + lexer = self.options.lexer + assert lexer in ('standard', 'contextual', 'dynamic', 'dynamic_complete') or issubclass(lexer, Lexer) + + if self.options.ambiguity == 'auto': + if self.options.parser == 'earley': + self.options.ambiguity = 'resolve' + else: + disambig_parsers = ['earley', 'cyk'] + assert self.options.parser in disambig_parsers, ( + 'Only %s supports disambiguation right now') % ', '.join(disambig_parsers) + + if self.options.priority == 'auto': + if self.options.parser in ('earley', 'cyk', ): + self.options.priority = 'normal' + elif self.options.parser in ('lalr', ): + self.options.priority = None + elif self.options.priority in ('invert', 'normal'): + assert self.options.parser in ('earley', 'cyk'), "priorities are not supported for LALR at this time" + + assert self.options.priority in ('auto', None, 'normal', 'invert'), 'invalid priority option specified: {}. options are auto, none, normal, invert.'.format(self.options.priority) + assert self.options.ambiguity not in ('resolve__antiscore_sum', ), 'resolve__antiscore_sum has been replaced with the option priority="invert"' + assert self.options.ambiguity in ('resolve', 'explicit', 'auto', ) + + # Parse the grammar file and compose the grammars (TODO) + self.grammar = load_grammar(grammar, self.source) + + # Compile the EBNF grammar into BNF + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile() + + # If the user asked to invert the priorities, negate them all here. + # This replaces the old 'resolve__antiscore_sum' option. + if self.options.priority == 'invert': + for rule in self.rules: + if rule.options and rule.options.priority is not None: + rule.options.priority = -rule.options.priority + # Else, if the user asked to disable priorities, strip them from the + # rules. This allows the Earley parsers to skip an extra forest walk + # for improved performance, if you don't need them (or didn't specify any). 
+ elif self.options.priority == None: + for rule in self.rules: + if rule.options and rule.options.priority is not None: + rule.options.priority = None + self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) + + if self.options.parser: + self.parser = self._build_parser() + elif lexer: + self.lexer = self._build_lexer() + + if __init__.__doc__: + __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC + + __serialize_fields__ = 'parser', 'rules', 'options' + + def _build_lexer(self): + return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) + + def _prepare_callbacks(self): + self.parser_class = get_frontend(self.options.parser, self.options.lexer) + self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders) + self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer) + + def _build_parser(self): + self._prepare_callbacks() + parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) + return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + + @classmethod + def deserialize(cls, data, namespace, memo, transformer=None, postlex=None): + if memo: + memo = SerializeMemoizer.deserialize(memo, namespace, {}) + inst = cls.__new__(cls) + options = dict(data['options']) + options['transformer'] = transformer + options['postlex'] = postlex + inst.options = LarkOptions.deserialize(options, memo) + inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] + inst._prepare_callbacks() + inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) + return inst + + + @classmethod + def open(cls, grammar_filename, rel_to=None, **options): + """Create an instance of Lark with the grammar given by its filename + + If rel_to is provided, the function will find the grammar filename in relation to it. + + Example: + + >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr") + Lark(...) + + """ + if rel_to: + basepath = os.path.dirname(rel_to) + grammar_filename = os.path.join(basepath, grammar_filename) + with open(grammar_filename, encoding='utf8') as f: + return cls(f, **options) def __repr__(self): - return 'RuleOptions(%r, %r, %r, %r, %r)' % ( - self.keep_all_tokens, - self.expand1, - self.create_token, - self.priority, - self.filter_out - ) + return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer) -Shift = 0 -Reduce = 1 -import re -MRES = ( -[(u'(?P(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P(?:[ \t\x0c\r\n])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])', - {1: u'SIGNED_NUMBER', - 2: u'ESCAPED_STRING', - 3: u'WS', - 4: u'__FALSE1', - 5: u'__NULL2', - 6: u'__TRUE0', - 7: u'__COLON', - 8: u'__COMMA', - 9: u'__LBRACE', - 10: u'__LSQB', - 11: u'__RBRACE', - 12: u'__RSQB'})] -) -LEXER_CALLBACK = ( -{} + + def lex(self, text): + "Only lex (and postlex) the text, without parsing it. 
Only relevant when lexer='standard'" + if not hasattr(self, 'lexer'): + self.lexer = self._build_lexer() + stream = self.lexer.lex(text) + if self.options.postlex: + return self.options.postlex.process(stream) + return stream + + def parse(self, text): + "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise." + return self.parser.parse(text) + + +DATA = ( +{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 18}, {'@': 16}, {'@': 23}, {'@': 21}, {'@': 17}, {'@': 28}, {'@': 30}, {'@': 25}, {'@': 29}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 19}, {'@': 12}, {'@': 14}], 'parser': {'parser': {'tokens': {0: 'COMMA', 1: 'RBRACE', 2: u'pair', 3: u'ESCAPED_STRING', 4: u'string', 5: 'COLON', 6: 'RSQB', 7: '$END', 8: 'LBRACE', 9: u'FALSE', 10: u'object', 11: u'SIGNED_NUMBER', 12: u'value', 13: 'LSQB', 14: u'NULL', 15: u'TRUE', 16: u'array', 17: '__anon_star_1', 18: '__anon_star_0', 19: 'start'}, 'states': {0: {0: (0, 1), 1: (0, 32)}, 1: {2: (0, 5), 3: (0, 21), 4: (0, 3)}, 2: {0: (1, {'@': 12}), 1: (1, {'@': 12})}, 3: {5: (0, 13)}, 4: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 6: (1, {'@': 13}), 7: (1, {'@': 13})}, 5: {0: (1, {'@': 14}), 1: (1, {'@': 14})}, 6: {0: (1, {'@': 15}), 6: (1, {'@': 15})}, 7: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 6: (1, {'@': 16}), 7: (1, {'@': 16})}, 8: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 12), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 9: {0: (1, {'@': 17}), 1: (1, {'@': 17}), 6: (1, {'@': 17}), 7: (1, {'@': 17})}, 10: {0: (0, 22), 17: (0, 0), 1: (0, 26)}, 11: {0: (1, {'@': 18}), 1: (1, {'@': 18}), 6: (1, {'@': 18}), 7: (1, {'@': 18})}, 12: {0: (1, {'@': 19}), 6: (1, {'@': 19})}, 13: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 15), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 14: {3: (0, 21), 4: (0, 4), 6: (0, 30), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 23), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 15: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 16: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 6: (1, {'@': 21}), 7: (1, {'@': 21})}, 17: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 6), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 18: {}, 19: {7: (0, 18)}, 20: {0: (0, 8), 6: (0, 16)}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 5: (1, {'@': 22}), 6: (1, {'@': 22}), 7: (1, {'@': 22})}, 22: {2: (0, 2), 3: (0, 21), 4: (0, 3)}, 23: {0: (0, 17), 18: (0, 20), 6: (0, 9)}, 24: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 6: (1, {'@': 23}), 7: (1, {'@': 23})}, 25: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 6: (1, {'@': 24}), 7: (1, {'@': 24})}, 26: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 6: (1, {'@': 25}), 7: (1, {'@': 25})}, 27: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 6: (1, {'@': 26}), 7: (1, {'@': 26})}, 28: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 29), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27), 19: (0, 19)}, 29: {7: (1, {'@': 27})}, 30: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 6: (1, {'@': 28}), 7: (1, {'@': 28})}, 31: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 6: (1, {'@': 29}), 7: (1, {'@': 29})}, 32: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 6: (1, {'@': 30}), 7: (1, {'@': 30})}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 6: (1, {'@': 31}), 7: (1, {'@': 31})}, 34: {1: (0, 31), 2: (0, 10), 3: (0, 21), 4: (0, 3)}}, 'end_state': 18, 'start_state': 28}, '__type__': 'LALR_TraditionalLexer', 'lexer': {'ignore_types': [u'WS'], 
'terminals': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], '__type__': 'TraditionalLexer', 'newline_types': [u'WS']}}, '__type__': 'Lark', 'options': {'profile': False, 'transformer': None, 'lexer': 'standard', 'lexer_callbacks': {}, 'postlex': None, 'parser': 'lalr', 'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': 'start', 'keep_all_tokens': False, 'ambiguity': 'auto', 'debug': False, 'propagate_positions': False, 'maybe_placeholders': False}} ) -NEWLINE_TYPES = [u'WS'] -IGNORE_TYPES = [u'WS'] -class LexerRegexps: pass -lexer_regexps = LexerRegexps() -lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES] -lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres]) - for n, mres in LEXER_CALLBACK.items()} -lexer = _Lex(lexer_regexps) -def lex(stream): - return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES) -RULES = { - 0: Rule(u'start', [u'value'], None, RuleOptions(False, True, None, None, False)), - 1: Rule(u'value', [u'string'], None, RuleOptions(False, True, None, None, False)), - 2: Rule(u'value', [u'__TRUE0'], u'true', RuleOptions(False, True, None, None, False)), - 3: Rule(u'value', [u'array'], None, RuleOptions(False, True, None, None, False)), - 4: Rule(u'value', [u'__NULL2'], u'null', RuleOptions(False, True, None, None, False)), - 5: Rule(u'value', [u'SIGNED_NUMBER'], u'number', RuleOptions(False, True, None, None, False)), - 6: Rule(u'value', [u'object'], None, RuleOptions(False, True, None, None, False)), - 7: Rule(u'value', [u'__FALSE1'], u'false', RuleOptions(False, True, None, None, False)), - 8: Rule(u'array', ['__LSQB', u'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)), - 9: Rule(u'array', ['__LSQB', u'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)), - 10: Rule(u'array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)), - 11: Rule(u'object', ['__LBRACE', u'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), - 12: Rule(u'object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), - 13: Rule(u'object', ['__LBRACE', u'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), - 14: Rule(u'pair', [u'string', '__COLON', u'value'], None, RuleOptions(False, False, None, None, False)), - 15: Rule(u'string', [u'ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)), - 16: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', u'value'], None, None), - 17: Rule('__anon_star_0', ['__COMMA', u'value'], None, None), - 18: Rule('__anon_star_1', ['__COMMA', u'pair'], None, None), - 19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', u'pair'], None, None), -} -parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree) -class ParseTable: pass -parse_table = ParseTable() -STATES = { - 0: {0: (1, 4), 1: (1, 4), 2: (1, 4), 3: (1, 4)}, - 1: {1: (1, 14), 2: (1, 14)}, - 2: {0: (0, 29), 1: (0, 32), 4: (0, 9)}, - 3: {1: (0, 13), 2: (0, 12)}, - 4: {0: (1, 1), 1: (1, 1), 2: (1, 1), 3: (1, 1)}, - 5: {0: (1, 10), 1: (1, 10), 2: (1, 10), 3: (1, 10)}, - 6: {2: (0, 15), 5: (0, 27), 6: (0, 16), 7: (0, 26)}, - 7: {5: (0, 34), 6: (0, 16), 7: (0, 26)}, - 8: {0: (1, 2), 1: (1, 2), 2: (1, 2), 3: (1, 2)}, - 9: {0: (0, 11), 1: (0, 22)}, - 10: {0: (1, 6), 1: (1, 6), 2: (1, 6), 3: (1, 6)}, - 11: {0: (1, 9), 1: (1, 9), 2: (1, 9), 3: (1, 9)}, - 12: {0: (1, 11), 1: (1, 11), 2: (1, 11), 3: (1, 11)}, - 13: {5: 
(0, 20), 6: (0, 16), 7: (0, 26)}, - 14: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 17), 14: (0, 33), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 15: {0: (1, 12), 1: (1, 12), 2: (1, 12), 3: (1, 12)}, - 16: {0: (1, 15), 1: (1, 15), 2: (1, 15), 3: (1, 15), 18: (1, 15)}, - 17: {3: (1, 0)}, - 18: {}, - 19: {0: (1, 3), 1: (1, 3), 2: (1, 3), 3: (1, 3)}, - 20: {1: (1, 19), 2: (1, 19)}, - 21: {0: (1, 5), 1: (1, 5), 2: (1, 5), 3: (1, 5)}, - 22: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 30), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 23: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 1), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 24: {0: (0, 5), 6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 2), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 25: {0: (1, 13), 1: (1, 13), 2: (1, 13), 3: (1, 13)}, - 26: {18: (0, 23)}, - 27: {1: (0, 7), 2: (0, 25), 19: (0, 3)}, - 28: {0: (1, 17), 1: (1, 17)}, - 29: {0: (1, 8), 1: (1, 8), 2: (1, 8), 3: (1, 8)}, - 30: {0: (1, 16), 1: (1, 16)}, - 31: {0: (1, 7), 1: (1, 7), 2: (1, 7), 3: (1, 7)}, - 32: {6: (0, 16), 7: (0, 4), 8: (0, 6), 9: (0, 31), 10: (0, 24), 11: (0, 10), 12: (0, 21), 13: (0, 28), 15: (0, 0), 16: (0, 19), 17: (0, 8)}, - 33: {3: (0, 18)}, - 34: {1: (1, 18), 2: (1, 18)}, -} -TOKEN_TYPES = ( -{0: '__RSQB', - 1: '__COMMA', - 2: '__RBRACE', - 3: '$END', - 4: '__anon_star_0', - 5: u'pair', - 6: u'ESCAPED_STRING', - 7: u'string', - 8: '__LBRACE', - 9: u'__FALSE1', - 10: '__LSQB', - 11: u'object', - 12: u'SIGNED_NUMBER', - 13: u'value', - 14: 'start', - 15: u'__NULL2', - 16: u'array', - 17: u'__TRUE0', - 18: '__COLON', - 19: '__anon_star_1'} +MEMO = ( +{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', 'flags': [], 'value': u'(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+)'}, '__type__': 'TerminalDef', 'name': u'SIGNED_NUMBER'}, 1: {'priority': 1, 'pattern': {'__type__': 'PatternRE', 'flags': [], 'value': u'\\".*?(?