@@ -427,9 +427,9 @@ I measured memory consumption using a little script called [memusg](https://gist
 | Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M |
 | Lark - LALR(1) | 8s | 1.53s | 453M | 266M |
 | Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M |
-| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
-| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
-| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M |
+| PyParsing ([Parser](https://github.com/pyparsing/pyparsing/blob/master/examples/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
+| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
+| Parsimonious ([Parser](https://gist.github.com/reclosedev/5222560)) | ? | 5.7s | ? | 1545M |

 I added a few other parsers for comparison. PyParsing and funcparserlib fare pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1).
| @@ -442,7 +442,7 @@ Once again, shout-out to PyPy for being so effective. | |||||
| This is the end of the tutorial. I hoped you liked it and learned a little about Lark. | This is the end of the tutorial. I hoped you liked it and learned a little about Lark. | ||||
| To see what else you can do with Lark, check out the [examples](examples). | |||||
| To see what else you can do with Lark, check out the [examples](/examples). | |||||
| For questions or any other subject, feel free to email me at erezshin at gmail dot com. | For questions or any other subject, feel free to email me at erezshin at gmail dot com. | ||||
| @@ -33,7 +33,7 @@ class LarkOptions: | |||||
| regex: bool | regex: bool | ||||
| debug: bool | debug: bool | ||||
| keep_all_tokens: bool | keep_all_tokens: bool | ||||
| propagate_positions: Union[bool, str] | |||||
| propagate_positions: Union[bool, Callable] | |||||
| maybe_placeholders: bool | maybe_placeholders: bool | ||||
| lexer_callbacks: Dict[str, Callable[[Token], Token]] | lexer_callbacks: Dict[str, Callable[[Token], Token]] | ||||
| cache: Union[bool, str] | cache: Union[bool, str] | ||||
| @@ -77,7 +77,7 @@ class Lark: | |||||
| regex: bool = False, | regex: bool = False, | ||||
| debug: bool = False, | debug: bool = False, | ||||
| keep_all_tokens: bool = False, | keep_all_tokens: bool = False, | ||||
| propagate_positions: Union[bool, str] = False, | |||||
| propagate_positions: Union[bool, Callable] = False, | |||||
| maybe_placeholders: bool = False, | maybe_placeholders: bool = False, | ||||
| lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, | lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, | ||||
| cache: Union[bool, str] = False, | cache: Union[bool, str] = False, | ||||
@@ -1,4 +1,5 @@
 from warnings import warn
+from copy import deepcopy

 from .utils import Serialize
 from .lexer import TerminalDef
@@ -31,6 +32,17 @@ class LexerConf(Serialize):
     def _deserialize(self):
         self.terminals_by_name = {t.name: t for t in self.terminals}

+    def __deepcopy__(self, memo=None):
+        return type(self)(
+            deepcopy(self.terminals, memo),
+            self.re_module,
+            deepcopy(self.ignore, memo),
+            deepcopy(self.postlex, memo),
+            deepcopy(self.callbacks, memo),
+            deepcopy(self.g_regex_flags, memo),
+            deepcopy(self.skip_validation, memo),
+            deepcopy(self.use_bytes, memo),
+        )
+

 class ParserConf(Serialize):
@@ -129,6 +129,8 @@ class UnexpectedInput(LarkError):
 class UnexpectedEOF(ParseError, UnexpectedInput):
     def __init__(self, expected, state=None, terminals_by_name=None):
+        super(UnexpectedEOF, self).__init__()
+
         self.expected = expected
         self.state = state
         from .lexer import Token
@@ -138,7 +140,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
         self.column = -1
         self._terminals_by_name = terminals_by_name
-        super(UnexpectedEOF, self).__init__()

     def __str__(self):
         message = "Unexpected end-of-input. "
@@ -149,6 +150,8 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
                  terminals_by_name=None, considered_rules=None):
+        super(UnexpectedCharacters, self).__init__()
+
         # TODO considered_tokens and allowed can be figured out using state
         self.line = line
         self.column = column
@@ -167,7 +170,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
         self.char = seq[lex_pos]
         self._context = self.get_context(seq)
-        super(UnexpectedCharacters, self).__init__()

     def __str__(self):
         message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column)
@@ -190,6 +192,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     """
     def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
+        super(UnexpectedToken, self).__init__()
+
         # TODO considered_rules and expected can be figured out using state
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
@@ -204,7 +208,6 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self._terminals_by_name = terminals_by_name
         self.token_history = token_history
-        super(UnexpectedToken, self).__init__()

     @property
     def accepts(self):
@@ -236,10 +239,10 @@ class VisitError(LarkError):
     """
     def __init__(self, rule, obj, orig_exc):
-        self.obj = obj
-        self.orig_exc = orig_exc
         message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
         super(VisitError, self).__init__(message)
+
+        self.obj = obj
+        self.orig_exc = orig_exc

 ###}
@@ -44,7 +44,7 @@ class LarkOptions(Serialize):
             Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
     propagate_positions
             Propagates (line, column, end_line, end_column) attributes into all tree branches.
-            Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees.
+            Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating.
     maybe_placeholders
             When ``True``, the ``[]`` operator returns ``None`` when not matched.
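
An editor's note on the new option form: the removed `"ignore_ws"` shortcut is subsumed by the callable, which is applied to each child node (a `Tree` or `Token`) and should return a falsy value for nodes that must not contribute positions. A minimal sketch, not part of the diff — the grammar and the `skip_whitespace` helper are made up for illustration:

```python
from lark import Lark, Token

# Roughly reproduces the old "ignore_ws" behaviour with the new callable form:
# whitespace-only tokens are skipped when propagating positions.
def skip_whitespace(node):
    return not (isinstance(node, Token) and node.isspace())

parser = Lark(r"""
    start: WS? WORD
    WORD: /\w+/
    WS: / +/
""", parser="lalr", propagate_positions=skip_whitespace)

tree = parser.parse("  hi")
print(tree.meta.column)  # 3 -- the leading whitespace token did not set the start position
```
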
@@ -162,7 +162,7 @@ class LarkOptions(Serialize):
         assert_config(self.parser, ('earley', 'lalr', 'cyk', None))

         if self.parser == 'earley' and self.transformer:
-            raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.'
+            raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
                                      'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

         if o:
@@ -451,11 +451,11 @@ class Lark(Serialize):
             d = f
         else:
             d = pickle.load(f)
-        memo = d['memo']
+        memo_json = d['memo']
         data = d['data']

-        assert memo
-        memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
+        assert memo_json
+        memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
         options = dict(data['options'])
         if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
             raise ConfigurationError("Some options are not allowed when loading a Parser: {}"
@@ -512,11 +512,11 @@ class Lark(Serialize):
             Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
         """
-        package = FromPackageLoader(package, search_paths)
-        full_path, text = package(None, grammar_path)
+        package_loader = FromPackageLoader(package, search_paths)
+        full_path, text = package_loader(None, grammar_path)
         options.setdefault('source_path', full_path)
         options.setdefault('import_paths', [])
-        options['import_paths'].append(package)
+        options['import_paths'].append(package_loader)
         return cls(text, **options)

     def __repr__(self):
@@ -120,33 +120,33 @@ class Token(Str):
     Attributes:
         type: Name of the token (as specified in grammar)
         value: Value of the token (redundant, as ``token.value == token`` will always be true)
-        pos_in_stream: The index of the token in the text
+        start_pos: The index of the token in the text
         line: The line of the token in the text (starting with 1)
         column: The column of the token in the text (starting with 1)
         end_line: The line where the token ends
         end_column: The next column after the end of the token. For example,
            if the token is a single character with a column value of 4,
            end_column will be 5.
-        end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``)
+        end_pos: the index where the token ends (basically ``start_pos + len(token)``)
     """
     __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

     def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None):
         try:
-            self = super(Token, cls).__new__(cls, value)
+            inst = super(Token, cls).__new__(cls, value)
         except UnicodeDecodeError:
             value = value.decode('latin1')
-            self = super(Token, cls).__new__(cls, value)
-
-        self.type = type_
-        self.start_pos = start_pos if start_pos is not None else pos_in_stream
-        self.value = value
-        self.line = line
-        self.column = column
-        self.end_line = end_line
-        self.end_column = end_column
-        self.end_pos = end_pos
-        return self
+            inst = super(Token, cls).__new__(cls, value)
+
+        inst.type = type_
+        inst.start_pos = start_pos if start_pos is not None else pos_in_stream
+        inst.value = value
+        inst.line = line
+        inst.column = column
+        inst.end_line = end_line
+        inst.end_column = end_column
+        inst.end_pos = end_pos
+        return inst

     @property
     def pos_in_stream(self):
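
For readers tracking the rename: `start_pos` is now the primary attribute, and `pos_in_stream` survives only as the property alias defined just above. A short usage sketch with a throwaway grammar, for illustration only:

```python
from lark import Lark

parser = Lark(r"""
    start: WORD+
    WORD: /\w+/
    %ignore /\s+/
""", parser="lalr")

for tok in parser.parse("lark is fun").children:
    # tok.start_pos replaces the old tok.pos_in_stream
    print(tok.type, str(tok), tok.start_pos, tok.end_pos, tok.line, tok.column)
```
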
@@ -214,15 +214,13 @@ class LineCounter:
 class UnlessCallback:
-    def __init__(self, mres):
-        self.mres = mres
+    def __init__(self, scanner):
+        self.scanner = scanner

     def __call__(self, t):
-        for mre, type_from_index in self.mres:
-            m = mre.match(t.value)
-            if m:
-                t.type = type_from_index[m.lastindex]
-                break
+        res = self.scanner.match(t.value, 0)
+        if res:
+            _value, t.type = res
         return t
@@ -237,6 +235,11 @@ class CallChain:
         return self.callback2(t) if self.cond(t2) else t2


+def _get_match(re_, regexp, s, flags):
+    m = re_.match(regexp, s, flags)
+    if m:
+        return m.group(0)
+
 def _create_unless(terminals, g_regex_flags, re_, use_bytes):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
@@ -248,40 +251,54 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
-            if m and m.group(0) == s:
+            if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
+            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

-    terminals = [t for t in terminals if t not in embedded_strs]
-    return terminals, callback
+    new_terminals = [t for t in terminals if t not in embedded_strs]
+    return new_terminals, callback


-def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
-    # Python sets an unreasonable group limit (currently 100) in its re module
-    # Worse, the only way to know we reached it is by catching an AssertionError!
-    # This function recursively tries less and less groups until it's successful.
-    postfix = '$' if match_whole else ''
-    mres = []
-    while terminals:
-        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
-        if use_bytes:
-            pattern = pattern.encode('latin-1')
-        try:
-            mre = re_.compile(pattern, g_regex_flags)
-        except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
-        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
-        terminals = terminals[max_size:]
-    return mres
-
-
-def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
+class Scanner:
+    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+        self.terminals = terminals
+        self.g_regex_flags = g_regex_flags
+        self.re_ = re_
+        self.use_bytes = use_bytes
+        self.match_whole = match_whole
+
+        self.allowed_types = {t.name for t in self.terminals}
+
+        self._mres = self._build_mres(terminals, len(terminals))
+
+    def _build_mres(self, terminals, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries less and less groups until it's successful.
+        postfix = '$' if self.match_whole else ''
+        mres = []
+        while terminals:
+            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+            if self.use_bytes:
+                pattern = pattern.encode('latin-1')
+            try:
+                mre = self.re_.compile(pattern, self.g_regex_flags)
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(terminals, max_size//2)
+            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
+            terminals = terminals[max_size:]
+        return mres
+
+    def match(self, text, pos):
+        for mre, type_from_index in self._mres:
+            m = mre.match(text, pos)
+            if m:
+                return m.group(0), type_from_index[m.lastindex]


 def _regexp_has_newline(r):
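
For context on the `AssertionError` handling that moved into `Scanner._build_mres`: on older CPython releases, `re` refuses to compile a pattern with more than 100 named groups and signals this with an `AssertionError`, which is why the method halves `max_size` and retries. A quick way to observe it (behaviour depends on the Python version; newer releases have lifted the limit):

```python
import re

# Build one big alternation of named groups, the way the Scanner does.
pattern = "|".join("(?P<T%d>x%d)" % (i, i) for i in range(200))
try:
    re.compile(pattern)
    print("this Python accepts more than 100 named groups")
except (AssertionError, re.error) as exc:  # older CPython raises AssertionError here
    print("group limit hit:", exc)
```
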
@@ -341,9 +358,9 @@ class TraditionalLexer(Lexer):
         self.use_bytes = conf.use_bytes
         self.terminals_by_name = conf.terminals_by_name

-        self._mres = None
+        self._scanner = None

-    def _build(self):
+    def _build_scanner(self):
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
         assert all(self.callback.values())
@@ -354,19 +371,16 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f

-        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
+        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

     @property
-    def mres(self):
-        if self._mres is None:
-            self._build()
-        return self._mres
+    def scanner(self):
+        if self._scanner is None:
+            self._build_scanner()
+        return self._scanner

     def match(self, text, pos):
-        for mre, type_from_index in self.mres:
-            m = mre.match(text, pos)
-            if m:
-                return m.group(0), type_from_index[m.lastindex]
+        return self.scanner.match(text, pos)

     def lex(self, state, parser_state):
         with suppress(EOFError):
@@ -378,7 +392,7 @@ class TraditionalLexer(Lexer):
         while line_ctr.char_pos < len(lex_state.text):
             res = self.match(lex_state.text, line_ctr.char_pos)
             if not res:
-                allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+                allowed = self.scanner.allowed_types - self.ignore_types
                 if not allowed:
                     allowed = {"<END-OF-FILE>"}
                 raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
@@ -23,54 +23,59 @@ class ExpandSingleChild:
 class PropagatePositions:
-    def __init__(self, node_builder):
+    def __init__(self, node_builder, node_filter=None):
         self.node_builder = node_builder
+        self.node_filter = node_filter

     def __call__(self, children):
         res = self.node_builder(children)

-        # local reference to Tree.meta reduces number of presence checks
         if isinstance(res, Tree):
-            res_meta = res.meta
+            # Calculate positions while the tree is streaming, according to the rule:
+            # - nodes start at the start of their first child's container,
+            #   and end at the end of their last child's container.
+            # Containers are nodes that take up space in text, but have been inlined in the tree.

-            src_meta = self._pp_get_meta(children)
-            if src_meta is not None:
-                res_meta.line = src_meta.line
-                res_meta.column = src_meta.column
-                res_meta.start_pos = src_meta.start_pos
-                res_meta.empty = False
+            res_meta = res.meta

-            src_meta = self._pp_get_meta(reversed(children))
-            if src_meta is not None:
-                res_meta.end_line = src_meta.end_line
-                res_meta.end_column = src_meta.end_column
-                res_meta.end_pos = src_meta.end_pos
-                res_meta.empty = False
+            first_meta = self._pp_get_meta(children)
+            if first_meta is not None:
+                if not hasattr(res_meta, 'line'):
+                    # meta was already set, probably because the rule has been inlined (e.g. `?rule`)
+                    res_meta.line = getattr(first_meta, 'container_line', first_meta.line)
+                    res_meta.column = getattr(first_meta, 'container_column', first_meta.column)
+                    res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos)
+                    res_meta.empty = False
+
+                res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line)
+                res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column)
+
+            last_meta = self._pp_get_meta(reversed(children))
+            if last_meta is not None:
+                if not hasattr(res_meta, 'end_line'):
+                    res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
+                    res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)
+                    res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos)
+                    res_meta.empty = False
+
+                res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line)
+                res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column)

         return res

     def _pp_get_meta(self, children):
         for c in children:
+            if self.node_filter is not None and not self.node_filter(c):
+                continue
             if isinstance(c, Tree):
                 if not c.meta.empty:
                     return c.meta
             elif isinstance(c, Token):
                 return c

-class PropagatePositions_IgnoreWs(PropagatePositions):
-    def _pp_get_meta(self, children):
-        for c in children:
-            if isinstance(c, Tree):
-                if not c.meta.empty:
-                    return c.meta
-            elif isinstance(c, Token):
-                if c and not c.isspace():  # Disregard whitespace-only tokens
-                    return c
-

 def make_propagate_positions(option):
-    if option == "ignore_ws":
-        return PropagatePositions_IgnoreWs
+    if callable(option):
+        return partial(PropagatePositions, node_filter=option)
     elif option is True:
         return PropagatePositions
     elif option is False:
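
To make the "container" rule in the comments above concrete: with an inlined rule such as `?b: "(" t ")"`, the surviving `t` node keeps its own tight span, while the extent of the inlined parentheses is remembered in the `container_*` meta attributes and is what propagates upward. An editor's sketch mirroring the new test added at the bottom of this changeset (the `container_*` attributes are internal bookkeeping, shown here only for illustration):

```python
from lark import Lark

parser = Lark('''start: a
                 a: b
                 ?b: "(" t ")"
                 !t: "t"
              ''', propagate_positions=True)

t = parser.parse("(t)").children[0].children[0]
print(t.meta.column, t.meta.end_column)                      # 2 3 -- just the "t"
print(t.meta.container_column, t.meta.container_end_column)  # expected 1 4 -- includes the parentheses
```
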
@@ -39,8 +39,7 @@ class MakeParsingFrontend:
         lexer_conf.lexer_type = self.lexer_type
         return ParsingFrontend(lexer_conf, parser_conf, options)

-    @classmethod
-    def deserialize(cls, data, memo, lexer_conf, callbacks, options):
+    def deserialize(self, data, memo, lexer_conf, callbacks, options):
         parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
         parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
         parser_conf.callbacks = callbacks
@@ -92,26 +91,26 @@ class ParsingFrontend(Serialize):
     def _verify_start(self, start=None):
         if start is None:
-            start = self.parser_conf.start
-            if len(start) > 1:
-                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
-            start ,= start
+            start_decls = self.parser_conf.start
+            if len(start_decls) > 1:
+                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls)
+            start ,= start_decls
         elif start not in self.parser_conf.start:
             raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
         return start

     def parse(self, text, start=None, on_error=None):
-        start = self._verify_start(start)
+        chosen_start = self._verify_start(start)
         stream = text if self.skip_lexer else LexerThread(self.lexer, text)
         kw = {} if on_error is None else {'on_error': on_error}
-        return self.parser.parse(stream, start, **kw)
+        return self.parser.parse(stream, chosen_start, **kw)

     def parse_interactive(self, text=None, start=None):
-        start = self._verify_start(start)
+        chosen_start = self._verify_start(start)
         if self.parser_conf.parser_type != 'lalr':
             raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
         stream = text if self.skip_lexer else LexerThread(self.lexer, text)
-        return self.parser.parse_interactive(stream, start)
+        return self.parser.parse_interactive(stream, chosen_start)


 def get_frontend(parser, lexer):
@@ -178,8 +178,8 @@ class _Parser(object):
             for token in state.lexer.lex(state):
                 state.feed_token(token)

-            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-            return state.feed_token(token, True)
+            end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            return state.feed_token(end_token, True)
         except UnexpectedInput as e:
             try:
                 e.interactive_parser = InteractiveParser(self, state, state.lexer)
@@ -73,14 +73,13 @@ class Serialize(object):
         fields = getattr(self, '__serialize_fields__')
         res = {f: _serialize(getattr(self, f), memo) for f in fields}
         res['__type__'] = type(self).__name__
-        postprocess = getattr(self, '_serialize', None)
-        if postprocess:
-            postprocess(res, memo)
+        if hasattr(self, '_serialize'):
+            self._serialize(res, memo)
         return res

     @classmethod
     def deserialize(cls, data, memo):
-        namespace = getattr(cls, '__serialize_namespace__', {})
+        namespace = getattr(cls, '__serialize_namespace__', [])
         namespace = {c.__name__:c for c in namespace}

         fields = getattr(cls, '__serialize_fields__')
@@ -94,9 +93,10 @@ class Serialize(object):
                 setattr(inst, f, _deserialize(data[f], namespace, memo))
             except KeyError as e:
                 raise KeyError("Cannot find key for class", cls, e)
-        postprocess = getattr(inst, '_deserialize', None)
-        if postprocess:
-            postprocess()
+
+        if hasattr(inst, '_deserialize'):
+            inst._deserialize()
+
         return inst
@@ -241,17 +241,6 @@ except ImportError:
     pass

-try:
-    compare = cmp
-except NameError:
-    def compare(a, b):
-        if a == b:
-            return 0
-        elif a > b:
-            return 1
-        return -1
-

 class Enumerator(Serialize):
     def __init__(self):
         self.enums = {}
@@ -94,6 +94,26 @@ class TestParsers(unittest.TestCase):
         r = g.parse('a')
         self.assertEqual( r.children[0].meta.line, 1 )

+    def test_propagate_positions2(self):
+        g = Lark("""start: a
+                    a: b
+                    ?b: "(" t ")"
+                    !t: "t"
+                    """, propagate_positions=True)
+
+        start = g.parse("(t)")
+        a ,= start.children
+        t ,= a.children
+        assert t.children[0] == "t"
+
+        assert t.meta.column == 2
+        assert t.meta.end_column == 3
+
+        assert start.meta.column == a.meta.column == 1
+        assert start.meta.end_column == a.meta.end_column == 4
+
     def test_expand1(self):
         g = Lark("""start: a