diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index 8a92a14..c9a5147 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,4 @@ -# The file was automatically generated by Lark v0.8.1 +# The file was automatically generated by Lark v0.9.0 # # # Lark Stand-alone Generator Tool @@ -58,14 +58,14 @@ class UnexpectedInput(LarkError): after = text[pos:end].split('\n', 1)[0] return before + after + '\n' + ' ' * len(before) + '^\n' - def match_examples(self, parse_fn, examples): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. """ assert self.state is not None, "Not supported for this exception" - candidate = None + candidate = (None, False) for label, example in examples.items(): assert not isinstance(example, STRING_TYPE) @@ -77,12 +77,18 @@ class UnexpectedInput(LarkError): try: if ut.token == self.token: # Try exact match first return label + + if token_type_match_fallback: + # Fallback to token types match + if (ut.token.type == self.token.type) and not candidate[-1]: + candidate = label, True + except AttributeError: pass - if not candidate: - candidate = label + if not candidate[0]: + candidate = label, False - return candidate + return candidate[0] class UnexpectedCharacters(LexError, UnexpectedInput): @@ -107,7 +113,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): self.token = token self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') @@ -115,6 +121,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.considered_rules = considered_rules self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.puppet = puppet message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" @@ -123,6 +130,12 @@ class UnexpectedToken(ParseError, UnexpectedInput): super(UnexpectedToken, self).__init__(message) class VisitError(LarkError): + """VisitError is raised when visitors are interrupted by an exception + + It provides the following attributes for inspection: + - obj: the tree node or token it was processing when the exception was raised + - orig_exc: the exception that cause it to fail + """ def __init__(self, rule, obj, orig_exc): self.obj = obj self.orig_exc = orig_exc @@ -246,16 +259,31 @@ def smart_decorator(f, create_decorator): else: return create_decorator(f.__func__.__call__, True) +try: + import regex +except ImportError: + regex = None + import sys, re Py36 = (sys.version_info[:2] >= (3, 6)) import sre_parse import sre_constants -def get_regexp_width(regexp): +categ_pattern = re.compile(r'\\p{[A-Za-z_]+}') +def get_regexp_width(expr): + if regex: + # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with + # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex + # match here below. 
+ regexp_final = re.sub(categ_pattern, 'A', expr) + else: + if re.search(categ_pattern, expr): + raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr) + regexp_final = expr try: - return [int(x) for x in sre_parse.parse(regexp).getwidth()] + return [int(x) for x in sre_parse.parse(regexp_final).getwidth()] except sre_constants.error: - raise ValueError(regexp) + raise ValueError(expr) class Meta: @@ -309,25 +337,15 @@ class Tree(object): return hash((self.data, tuple(self.children))) def iter_subtrees(self): - # TODO: Re-write as a more efficient version - - visited = set() - q = [self] + queue = [self] + subtrees = OrderedDict() + for subtree in queue: + subtrees[id(subtree)] = subtree + queue += [c for c in reversed(subtree.children) + if isinstance(c, Tree) and id(c) not in subtrees] - l = [] - while q: - subtree = q.pop() - l.append( subtree ) - if id(subtree) in visited: - continue # already been here from another branch - visited.add(id(subtree)) - q += [c for c in subtree.children if isinstance(c, Tree)] - - seen = set() - for x in reversed(l): - if id(x) not in seen: - yield x - seen.add(id(x)) + del queue + return reversed(list(subtrees.values())) def find_pred(self, pred): "Find all nodes where pred(tree) == True" @@ -356,11 +374,11 @@ class _Decoratable: # Make sure the function isn't inherited (unless it's overwritten) if name.startswith('_') or (name in libmembers and name not in cls.__dict__): continue - if not callable(cls.__dict__[name]): + if not callable(value): continue # Skip if v_args already applied (at the function level) - if hasattr(cls.__dict__[name], 'vargs_applied'): + if hasattr(cls.__dict__[name], 'vargs_applied') or hasattr(value, 'vargs_applied'): continue static = isinstance(cls.__dict__[name], (staticmethod, classmethod)) @@ -486,6 +504,38 @@ class Transformer_InPlace(Transformer): return self._transform_tree(tree) +class Transformer_NonRecursive(Transformer): + "Non-recursive. Doesn't change the original tree." + + def transform(self, tree): + # Tree to postfix + rev_postfix = [] + q = [tree] + while q: + t = q.pop() + rev_postfix.append( t ) + if isinstance(t, Tree): + q += t.children + + # Postfix to tree + stack = [] + for x in reversed(rev_postfix): + if isinstance(x, Tree): + size = len(x.children) + if size: + args = stack[-size:] + del stack[-size:] + else: + args = [] + stack.append(self._call_userfunc(x, args)) + else: + stack.append(x) + + t ,= stack # We should have only one tree remaining + return t + + + class Transformer_InPlaceRecursive(Transformer): "Recursive. Changes the tree in-place instead of returning new instances" def _transform_tree(self, tree): @@ -567,7 +617,7 @@ class Interpreter(_Decoratable): Calls its methods (provided by user via inheritance) according to tree.data Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. 
- The user has to explicitly call visit_children, or use the @visit_children_decor + The user has to explicitly call visit, visit_children, or use the @visit_children_decor """ def visit(self, tree): @@ -781,19 +831,21 @@ class NonTerminal(Symbol): class RuleOptions(Serialize): - __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices' + __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices' - def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()): + def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()): self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 self.priority = priority + self.template_source = template_source self.empty_indices = empty_indices def __repr__(self): - return 'RuleOptions(%r, %r, %r)' % ( + return 'RuleOptions(%r, %r, %r, %r)' % ( self.keep_all_tokens, self.expand1, self.priority, + self.template_source ) @@ -836,6 +888,7 @@ class Rule(Serialize): +from copy import copy class Pattern(Serialize): @@ -918,7 +971,6 @@ class TerminalDef(Serialize): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - class Token(Str): __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') @@ -926,7 +978,8 @@ class Token(Str): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: - value = value.decode('latin1') + # value = value.decode('latin1') + value = value.decode("ascii", "backslashreplace") self = super(Token, cls).__new__(cls, value) self.type = type_ @@ -1060,7 +1113,7 @@ class CallChain: -def _create_unless(terminals): +def _create_unless(terminals, g_regex_flags, re_): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() embedded_strs = set() @@ -1071,19 +1124,19 @@ def _create_unless(terminals): if strtok.priority > retok.priority: continue s = strtok.pattern.value - m = re.match(retok.pattern.to_regexp(), s) + m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags) if m and m.group(0) == s: unless.append(strtok) if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) + callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, match_whole): +def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_): # Python sets an unreasonable group limit (currently 100) in its re module # Worse, the only way to know we reached it is by catching an AssertionError! # This function recursively tries less and less groups until it's successful. @@ -1091,17 +1144,17 @@ def _build_mres(terminals, max_size, match_whole): mres = [] while terminals: try: - mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size])) + mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) except AssertionError: # Yes, this is what Python provides us.. 
:/ - return _build_mres(terminals, max_size//2, match_whole) + return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_) # terms_from_name = {t.name: t for t in terminals[:max_size]} mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) terminals = terminals[max_size:] return mres -def build_mres(terminals, match_whole=False): - return _build_mres(terminals, len(terminals), match_whole) +def build_mres(terminals, g_regex_flags, re_, match_whole=False): + return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_) def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: @@ -1124,34 +1177,39 @@ class Lexer(object): class TraditionalLexer(Lexer): - def __init__(self, terminals, ignore=(), user_callbacks={}): + def __init__(self, conf): + terminals = list(conf.tokens) assert all(isinstance(t, TerminalDef) for t in terminals), terminals - terminals = list(terminals) + self.re = conf.re_module - # Sanitization - for t in terminals: - try: - re.compile(t.pattern.to_regexp()) - except re.error: - raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) + if not conf.skip_validation: + # Sanitization + for t in terminals: + try: + self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags) + except self.re.error: + raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) - if t.pattern.min_width == 0: - raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) + if t.pattern.min_width == 0: + raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern)) - assert set(ignore) <= {t.name for t in terminals} + assert set(conf.ignore) <= {t.name for t in terminals} # Init self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] - self.ignore_types = list(ignore) + self.ignore_types = list(conf.ignore) terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) self.terminals = terminals - self.user_callbacks = user_callbacks - self.build() + self.user_callbacks = conf.callbacks + self.g_regex_flags = conf.g_regex_flags - def build(self): - terminals, self.callback = _create_unless(self.terminals) + self._mres = None + # self.build(g_regex_flags) + + def _build(self): + terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re) assert all(self.callback.values()) for type_, f in self.user_callbacks.items(): @@ -1161,7 +1219,13 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self.mres = build_mres(terminals) + self._mres = build_mres(terminals, self.g_regex_flags, self.re) + + @property + def mres(self): + if self._mres is None: + self._build() + return self._mres def match(self, stream, pos): for mre, type_from_index in self.mres: @@ -1177,12 +1241,15 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): - def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): + def __init__(self, conf, states, always_accept=()): + terminals = list(conf.tokens) tokens_by_name = {} for t in terminals: assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t + trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation) + lexer_by_tokens = {} self.lexers = {} for state, accepts in states.items(): @@ -1190,14 +1257,17 @@ class ContextualLexer(Lexer): try: lexer = lexer_by_tokens[key] except KeyError: 
- accepts = set(accepts) | set(ignore) | set(always_accept) + accepts = set(accepts) | set(conf.ignore) | set(always_accept) state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] - lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) + lexer_conf = copy(trad_conf) + lexer_conf.tokens = state_tokens + lexer = TraditionalLexer(lexer_conf) lexer_by_tokens[key] = lexer self.lexers[state] = lexer - self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) + assert trad_conf.tokens is terminals + self.root_lexer = TraditionalLexer(trad_conf) def lex(self, stream, get_parser_state): parser_state = get_parser_state() @@ -1223,14 +1293,17 @@ class ContextualLexer(Lexer): class LexerConf(Serialize): - __serialize_fields__ = 'tokens', 'ignore' + __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags' __serialize_namespace__ = TerminalDef, - def __init__(self, tokens, ignore=(), postlex=None, callbacks=None): - self.tokens = tokens + def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False): + self.tokens = tokens # TODO should be terminals self.ignore = ignore self.postlex = postlex self.callbacks = callbacks or {} + self.g_regex_flags = g_regex_flags + self.re_module = re_module + self.skip_validation = skip_validation def _deserialize(self): self.callbacks = {} # TODO @@ -1257,33 +1330,39 @@ class PropagatePositions: def __call__(self, children): res = self.node_builder(children) + # local reference to Tree.meta reduces number of presence checks if isinstance(res, Tree): + res_meta = res.meta for c in children: - if isinstance(c, Tree) and not c.meta.empty: - res.meta.line = c.meta.line - res.meta.column = c.meta.column - res.meta.start_pos = c.meta.start_pos - res.meta.empty = False - break + if isinstance(c, Tree): + child_meta = c.meta + if not child_meta.empty: + res_meta.line = child_meta.line + res_meta.column = child_meta.column + res_meta.start_pos = child_meta.start_pos + res_meta.empty = False + break elif isinstance(c, Token): - res.meta.line = c.line - res.meta.column = c.column - res.meta.start_pos = c.pos_in_stream - res.meta.empty = False + res_meta.line = c.line + res_meta.column = c.column + res_meta.start_pos = c.pos_in_stream + res_meta.empty = False break for c in reversed(children): - if isinstance(c, Tree) and not c.meta.empty: - res.meta.end_line = c.meta.end_line - res.meta.end_column = c.meta.end_column - res.meta.end_pos = c.meta.end_pos - res.meta.empty = False - break + if isinstance(c, Tree): + child_meta = c.meta + if not child_meta.empty: + res_meta.end_line = child_meta.end_line + res_meta.end_column = child_meta.end_column + res_meta.end_pos = child_meta.end_pos + res_meta.empty = False + break elif isinstance(c, Token): - res.meta.end_line = c.end_line - res.meta.end_column = c.end_column - res.meta.end_pos = c.end_pos - res.meta.empty = False + res_meta.end_line = c.end_line + res_meta.end_column = c.end_column + res_meta.end_pos = c.end_pos + res_meta.empty = False break return res @@ -1473,7 +1552,7 @@ class ParseTreeBuilder: for rule, wrapper_chain in self.rule_builders: - user_callback_name = rule.alias or rule.origin.name + user_callback_name = rule.alias or rule.options.template_source or rule.origin.name try: f = getattr(transformer, user_callback_name) # XXX InlineTransformer is deprecated! 
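# A minimal sketch, as an aside to the hunks above: they thread a pluggable
# `re_module` and a single `g_regex_flags` value from LexerConf into
# _create_unless() and build_mres(), so one set of flags reaches every terminal.
# Assuming the full `lark` package (the generator of this file) rather than this
# standalone module, the flags would be supplied through the public API roughly
# like this; the grammar and `ci_parser` name are illustrative assumptions.
import re
from lark import Lark

grammar = r"""
    start: WORD+
    WORD: /[a-z]+/
    %import common.WS
    %ignore WS
"""

# Without flags, WORD only matches lowercase; with g_regex_flags the same
# pattern is compiled with re.IGNORECASE, so "Hello World" parses as two WORDs.
ci_parser = Lark(grammar, parser="lalr", g_regex_flags=re.IGNORECASE)
print(ci_parser.parse("Hello World").pretty())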
@@ -1499,6 +1578,7 @@ class ParseTreeBuilder: return callbacks + class LALR_Parser(object): def __init__(self, parser_conf, debug=False): assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" @@ -1508,7 +1588,7 @@ class LALR_Parser(object): self._parse_table = analysis.parse_table self.parser_conf = parser_conf - self.parser = _Parser(analysis.parse_table, callbacks) + self.parser = _Parser(analysis.parse_table, callbacks, debug) @classmethod def deserialize(cls, data, memo, callbacks): @@ -1525,22 +1605,20 @@ class LALR_Parser(object): class _Parser: - def __init__(self, parse_table, callbacks): - self.states = parse_table.states - self.start_states = parse_table.start_states - self.end_states = parse_table.end_states + def __init__(self, parse_table, callbacks, debug=False): + self.parse_table = parse_table self.callbacks = callbacks + self.debug = debug - def parse(self, seq, start, set_state=None): + def parse(self, seq, start, set_state=None, value_stack=None, state_stack=None): token = None stream = iter(seq) - states = self.states + states = self.parse_table.states + start_state = self.parse_table.start_states[start] + end_state = self.parse_table.end_states[start] - start_state = self.start_states[start] - end_state = self.end_states[start] - - state_stack = [start_state] - value_stack = [] + state_stack = state_stack or [start_state] + value_stack = value_stack or [] if set_state: set_state(start_state) @@ -1550,7 +1628,11 @@ class _Parser: return states[state][token.type] except KeyError: expected = [s for s in states[state].keys() if s.isupper()] - raise UnexpectedToken(token, expected, state=state) + try: + puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) + except NameError: + puppet = None + raise UnexpectedToken(token, expected, state=state, puppet=puppet) def reduce(rule): size = len(rule.expansion) @@ -1569,18 +1651,29 @@ class _Parser: value_stack.append(value) # Main LALR-parser loop - for token in stream: - while True: - action, arg = get_action(token) - assert arg != end_state - - if action is Shift: - state_stack.append(arg) - value_stack.append(token) - if set_state: set_state(arg) - break # next token - else: - reduce(arg) + try: + for token in stream: + while True: + action, arg = get_action(token) + assert arg != end_state + + if action is Shift: + state_stack.append(arg) + value_stack.append(token) + if set_state: set_state(arg) + break # next token + else: + reduce(arg) + except Exception as e: + if self.debug: + print("") + print("STATE STACK DUMP") + print("----------------") + for i, s in enumerate(state_stack): + print('%d)' % i , s) + print("") + + raise token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: @@ -1715,10 +1808,12 @@ class WithLexer(_ParserFrontend): self.postlex = lexer_conf.postlex @classmethod - def deserialize(cls, data, memo, callbacks, postlex): + def deserialize(cls, data, memo, callbacks, postlex, re_module): inst = super(WithLexer, cls).deserialize(data, memo) inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + inst.lexer_conf.re_module = re_module + inst.lexer_conf.skip_validation=True inst.init_lexer() return inst @@ -1734,7 +1829,7 @@ class WithLexer(_ParserFrontend): return self._parse(token_stream, start) def init_traditional_lexer(self): - self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, 
user_callbacks=self.lexer_conf.callbacks) + self.lexer = TraditionalLexer(self.lexer_conf) class LALR_WithLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): @@ -1744,7 +1839,7 @@ class LALR_WithLexer(WithLexer): self.init_lexer() - def init_lexer(self): + def init_lexer(self, **kw): raise NotImplementedError() class LALR_TraditionalLexer(LALR_WithLexer): @@ -1755,10 +1850,7 @@ class LALR_ContextualLexer(LALR_WithLexer): def init_lexer(self): states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} always_accept = self.postlex.always_accept if self.postlex else () - self.lexer = ContextualLexer(self.lexer_conf.tokens, states, - ignore=self.lexer_conf.ignore, - always_accept=always_accept, - user_callbacks=self.lexer_conf.callbacks) + self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept) def parse(self, text, start=None): @@ -1775,32 +1867,62 @@ class LarkOptions(Serialize): """ OPTIONS_DOC = """ - parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley") - Note: "lalr" requires a lexer - - lexer - Decides whether or not to use a lexer stage - "standard": Use a standard lexer - "contextual": Stronger lexer (only works with parser="lalr") - "dynamic": Flexible and powerful (only with parser="earley") - "dynamic_complete": Same as dynamic, but tries *every* variation - of tokenizing possible. (only with parser="earley") - "auto" (default): Choose for me based on grammar and parser - - ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" - "resolve": The parser will automatically choose the simplest derivation - (it chooses consistently: greedy for tokens, non-greedy for rules) - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). - - transformer - Applies the transformer to every parse tree - debug - Affects verbosity (default: False) - keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) - cache_grammar - Cache the Lark grammar (Default: False) - postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. - start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start") - priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) - propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. - lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. - maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None +# General + + start - The start symbol. Either a string, or a list of strings for + multiple possible starts (Default: "start") + debug - Display debug information, such as warnings (default: False) + transformer - Applies the transformer to every parse tree (equivlent to + applying it after the parse, but faster) + propagate_positions - Propagates (line, column, end_line, end_column) + attributes into all tree branches. + maybe_placeholders - When True, the `[]` operator returns `None` when not matched. + When `False`, `[]` behaves like the `?` operator, + and returns no value at all. + (default=`False`. Recommended to set to `True`) + regex - When True, uses the `regex` module instead of the stdlib `re`. + cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. 
+ LALR only for now. + When `False`, does nothing (default) + When `True`, caches to a temporary file in the local directory + When given a string, caches to the path pointed by the string + + g_regex_flags - Flags that are applied to all terminals + (both regex and strings) + keep_all_tokens - Prevent the tree builder from automagically + removing "punctuation" tokens (default: False) + +# Algorithm + + parser - Decides which parser engine to use + Accepts "earley" or "lalr". (Default: "earley") + (there is also a "cyk" option for legacy) + + lexer - Decides whether or not to use a lexer stage + "auto" (default): Choose for me based on the parser + "standard": Use a standard lexer + "contextual": Stronger lexer (only works with parser="lalr") + "dynamic": Flexible and powerful (only with parser="earley") + "dynamic_complete": Same as dynamic, but tries *every* variation + of tokenizing possible. + + ambiguity - Decides how to handle ambiguity in the parse. + Only relevant if parser="earley" + "resolve": The parser will automatically choose the simplest + derivation (it chooses consistently: greedy for + tokens, non-greedy for rules) + "explicit": The parser will return all derivations wrapped + in "_ambig" tree nodes (i.e. a forest). + +# Domain Specific + + postlex - Lexer post-processing (Default: None) Only works with the + standard and contextual lexers. + priority - How priorities should be evaluated - auto, none, normal, + invert (Default: auto) + lexer_callbacks - Dictionary of callbacks for the lexer. May alter + tokens during lexing. Use with caution. + edit_terminals - A callback """ if __doc__: __doc__ += OPTIONS_DOC @@ -1809,7 +1931,7 @@ class LarkOptions(Serialize): 'debug': False, 'keep_all_tokens': False, 'tree_class': None, - 'cache: False, + 'cache': False, 'postlex': None, 'parser': 'earley', 'lexer': 'auto', @@ -1817,10 +1939,12 @@ class LarkOptions(Serialize): 'start': 'start', 'priority': 'auto', 'ambiguity': 'auto', + 'regex': False, 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, 'edit_terminals': None, + 'g_regex_flags': 0, } def __init__(self, options_dict): @@ -1830,7 +1954,7 @@ class LarkOptions(Serialize): for name, default in self._defaults.items(): if name in o: value = o.pop(name) - if isinstance(default, bool): + if isinstance(default, bool) and name != 'cache': value = bool(value) else: value = default @@ -1875,8 +1999,19 @@ class Lark(Serialize): grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) options : a dictionary controlling various aspects of Lark. 
""" + self.options = LarkOptions(options) + # Set regex or re module + use_regex = self.options.regex + if use_regex: + if regex: + re_module = regex + else: + raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.') + else: + re_module = re + # Some, but not all file-like objects have a 'name' attribute try: self.source = grammar.name @@ -1893,8 +2028,27 @@ class Lark(Serialize): assert isinstance(grammar, STRING_TYPE) - if self.options.cache_grammar: - raise NotImplementedError("Not available yet") + cache_fn = None + if self.options.cache: + if self.options.parser != 'lalr': + raise NotImplementedError("cache only works with parser='lalr' for now") + if isinstance(self.options.cache, STRING_TYPE): + cache_fn = self.options.cache + else: + if self.options.cache is not True: + raise ValueError("cache must be bool or str") + unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') + from . import __version__ + options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) + s = grammar + options_str + __version__ + md5 = hashlib.md5(s.encode()).hexdigest() + cache_fn = '.lark_cache_%s.tmp' % md5 + + if FS.exists(cache_fn): + logging.debug('Loading grammar from cache: %s', cache_fn) + with FS.open(cache_fn, 'rb') as f: + self._load(f, self.options.transformer, self.options.postlex) + return if self.options.lexer == 'auto': if self.options.parser == 'lalr': @@ -1929,7 +2083,7 @@ class Lark(Serialize): assert self.options.ambiguity in ('resolve', 'explicit', 'auto', ) # Parse the grammar file and compose the grammars (TODO) - self.grammar = load_grammar(grammar, self.source) + self.grammar = load_grammar(grammar, self.source, re_module) # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) @@ -1962,20 +2116,25 @@ class Lark(Serialize): if hasattr(t, term.name): lexer_callbacks[term.name] = getattr(t, term.name) - self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks) + self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags) if self.options.parser: self.parser = self._build_parser() elif lexer: self.lexer = self._build_lexer() + if cache_fn: + logging.debug('Saving grammar to cache: %s', cache_fn) + with FS.open(cache_fn, 'wb') as f: + self.save(f) + if __init__.__doc__: - __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC + __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC __serialize_fields__ = 'parser', 'rules', 'options' def _build_lexer(self): - return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) + return TraditionalLexer(self.lexer_conf) def _prepare_callbacks(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) @@ -1987,34 +2146,42 @@ class Lark(Serialize): parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + def save(self, f): + data, m = self.memo_serialize([TerminalDef, Rule]) + pickle.dump({'data': data, 'memo': m}, f) + @classmethod - def deserialize(cls, data, namespace, memo, transformer=None, postlex=None): - if memo: - memo = SerializeMemoizer.deserialize(memo, namespace, {}) + def load(cls, f): inst = cls.__new__(cls) + return inst._load(f) + + def _load(self, f, transformer=None, postlex=None): 
+ if isinstance(f, dict): + d = f + else: + d = pickle.load(f) + memo = d['memo'] + data = d['data'] + + assert memo + memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) options = dict(data['options']) if transformer is not None: options['transformer'] = transformer if postlex is not None: options['postlex'] = postlex - inst.options = LarkOptions.deserialize(options, memo) - inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] - inst.source = '' - inst._prepare_callbacks() - inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) - return inst - - def save(self, f): - data, m = self.memo_serialize([TerminalDef, Rule]) - pickle.dump({'data': data, 'memo': m}, f) + self.options = LarkOptions.deserialize(options, memo) + re_module = regex if self.options.regex else re + self.rules = [Rule.deserialize(r, memo) for r in data['rules']] + self.source = '' + self._prepare_callbacks() + self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module) + return self @classmethod - def load(cls, f): - d = pickle.load(f) - namespace = {'Rule': Rule, 'TerminalDef': TerminalDef} - memo = d['memo'] - return Lark.deserialize(d['data'], namespace, memo) - + def _load_from_dict(cls, data, memo, transformer=None, postlex=None): + inst = cls.__new__(cls) + return inst._load({'data': data, 'memo': memo}, transformer, postlex) @classmethod def open(cls, grammar_filename, rel_to=None, **options): @@ -2051,24 +2218,38 @@ class Lark(Serialize): "Get information about a terminal" return self._terminals_dict[name] - def parse(self, text, start=None): + def parse(self, text, start=None, on_error=None): """Parse the given text, according to the options provided. - The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option). + Parameters: + start: str - required if Lark was given multiple possible start symbols (using the start option). + on_error: function - if provided, will be called on UnexpectedToken error. Return true to resume parsing. LALR only. Returns a tree, unless specified otherwise. 
""" - return self.parser.parse(text, start=start) + try: + return self.parser.parse(text, start=start) + except UnexpectedToken as e: + if on_error is None: + raise + + while True: + if not on_error(e): + raise e + try: + return e.puppet.resume_parse() + except UnexpectedToken as e2: + e = e2 + DATA = ( -{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 18}, {'@': 14}, {'@': 22}, {'@': 28}, {'@': 23}, {'@': 29}, {'@': 12}, {'@': 25}, {'@': 30}, {'@': 19}, {'@': 21}, {'@': 15}, {'@': 20}, {'@': 16}, {'@': 17}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'COMMA', 1: 'RSQB', 2: 'RBRACE', 3: '$END', 4: 'LBRACE', 5: u'FALSE', 6: u'string', 7: u'object', 8: u'NULL', 9: u'SIGNED_NUMBER', 10: u'value', 11: u'array', 12: u'ESCAPED_STRING', 13: u'TRUE', 14: 'LSQB', 15: 'COLON', 16: u'pair', 17: u'__array_star_0', 18: u'__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {1: (0, 29), 4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 6), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 2: {0: (0, 11), 2: (0, 0)}, 3: {15: (0, 12)}, 4: {16: (0, 13), 12: (0, 21), 6: (0, 3)}, 5: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 6: {0: (0, 7), 1: (0, 23), 17: (0, 17)}, 7: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 9), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 8: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 9: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 10: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 20), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 11: {16: (0, 15), 12: (0, 21), 6: (0, 3)}, 12: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 18), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 13: {0: (1, {'@': 16}), 2: (1, {'@': 16})}, 14: {}, 15: {0: (1, {'@': 17}), 2: (1, {'@': 17})}, 16: {0: (1, {'@': 18}), 1: (1, {'@': 18}), 2: (1, {'@': 18}), 3: (1, {'@': 18})}, 17: {0: (0, 10), 1: (0, 28)}, 18: {0: (1, {'@': 19}), 2: (1, {'@': 19})}, 19: {0: (0, 4), 18: (0, 2), 2: (0, 25)}, 20: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 21: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 2: (1, {'@': 21}), 3: (1, {'@': 21}), 15: (1, {'@': 21})}, 22: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22})}, 23: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 2: (1, {'@': 23}), 3: (1, {'@': 23})}, 24: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 2: (1, {'@': 24}), 3: (1, {'@': 24})}, 25: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 2: (1, {'@': 25}), 3: (1, {'@': 25})}, 26: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 27: {3: (1, {'@': 27})}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 31: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}, 32: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 22), 9: (0, 24), 10: (0, 27), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1), 19: (0, 14)}, 33: {16: (0, 19), 2: (0, 30), 12: (0, 21), 6: (0, 3)}}, 'end_states': {'start': 14}, 'start_states': 
{'start': 32}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'debug': False, 'postlex': None, 'parser': 'lalr', 'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': ['start'], 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': False, 'maybe_placeholders': False}} +{'rules': [{'@': 23}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 19}, {'@': 14}, {'@': 27}, {'@': 28}, {'@': 16}, {'@': 29}, {'@': 12}, {'@': 25}, {'@': 30}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 21}, {'@': 17}, {'@': 18}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], 'g_regex_flags': 0, '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'COMMA', 1: 'RSQB', 2: 'RBRACE', 3: '$END', 4: 'LBRACE', 5: u'FALSE', 6: u'string', 7: u'object', 8: u'NULL', 9: u'SIGNED_NUMBER', 10: u'value', 11: u'array', 12: u'ESCAPED_STRING', 13: u'TRUE', 14: 'LSQB', 15: 'COLON', 16: u'pair', 17: u'__array_star_0', 18: u'__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {1: (0, 29), 4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 6), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 2: {0: (0, 23), 2: (0, 0)}, 3: {15: (0, 12)}, 4: {16: (0, 13), 12: (0, 21), 6: (0, 3)}, 5: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 2: (1, {'@': 13}), 3: (1, {'@': 13})}, 6: {0: (0, 7), 1: (0, 11), 17: (0, 17)}, 7: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 9), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 8: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 9: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 10: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 20), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 11: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 12: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 18), 11: (0, 26), 12: (0, 21), 13: (0, 16), 14: (0, 1)}, 13: {0: (1, {'@': 17}), 2: (1, {'@': 17})}, 14: {}, 15: {0: (1, {'@': 18}), 2: (1, {'@': 18})}, 16: {0: (1, {'@': 19}), 1: (1, {'@': 19}), 2: (1, {'@': 19}), 3: (1, {'@': 19})}, 17: {0: (0, 10), 1: (0, 28)}, 18: {0: (1, {'@': 20}), 2: (1, {'@': 20})}, 19: {0: (0, 4), 18: (0, 2), 2: (0, 25)}, 20: {0: (1, {'@': 21}), 1: (1, {'@': 21})}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22}), 15: (1, {'@': 22})}, 22: {3: (1, {'@': 23})}, 23: {16: (0, 15), 12: (0, 21), 6: (0, 3)}, 24: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 2: (1, {'@': 24}), 3: (1, {'@': 24})}, 25: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 2: (1, {'@': 25}), 3: (1, {'@': 25})}, 26: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 2: (1, {'@': 26}), 3: (1, {'@': 26})}, 27: {0: (1, {'@': 27}), 1: (1, {'@': 27}), 2: (1, {'@': 27}), 3: (1, {'@': 27})}, 28: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 29: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 30: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 31: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}, 32: {4: (0, 33), 5: (0, 8), 6: (0, 5), 7: (0, 31), 8: (0, 27), 9: (0, 24), 10: (0, 22), 11: (0, 26), 
12: (0, 21), 13: (0, 16), 14: (0, 1), 19: (0, 14)}, 33: {16: (0, 19), 2: (0, 30), 12: (0, 21), 6: (0, 3)}}, 'end_states': {'start': 14}, 'start_states': {'start': 32}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'regex': False, 'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'start': ['start'], 'debug': False, 'postlex': None, 'parser': 'lalr', 'tree_class': None, 'priority': None, 'cache': False, 'g_regex_flags': 0, 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': False, 'maybe_placeholders': False}} ) MEMO = ( -{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', '_width': [2, 4294967295], 'flags': [], 'value': u'\\".*?(?
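# A minimal sketch of the error-recovery hook introduced above: parse() now
# accepts `on_error`, calls it with the UnexpectedToken instance, and, if the
# handler returns a true value, resumes via the attached puppet
# (`e.puppet.resume_parse()`), LALR only. The grammar, input, and handler below
# are illustrative assumptions, and the full `lark` package is assumed rather
# than this standalone module.
from lark import Lark

parser = Lark(r"""
    start: NUMBER ("," NUMBER)*
    %import common.NUMBER
    %import common.WS
    %ignore WS
""", parser="lalr", lexer="standard")

def skip_bad_token(e):
    # `e` is the UnexpectedToken raised by the LALR parser; e.token, e.expected
    # and e.puppet are available for inspection, as added in the diff above.
    print("recovering from:", e.token)
    return True  # truthy -> parse() calls e.puppet.resume_parse() and continues

# The stray comma triggers UnexpectedToken; with the handler, parsing should,
# in principle, resume past it and return a tree for the remaining numbers.
tree = parser.parse("1, 2,, 3", on_error=skip_bad_token)
print(tree.pretty())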