Merge branch 'master' into template_rules

5 years ago · 9f04c7f754
--- a/docs/classes.md
+++ b/docs/classes.md
@@ -8,41 +8,86 @@ This page details the important classes in Lark.

 The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor.

 #### \_\_init\_\_(self, grammar, **options)
 #### \_\_init\_\_(self, grammar_string, **options)

 The Lark class accepts a grammar string or file object, and keyword options:
 Creates an instance of Lark with the given grammar

 * **start** - A list of the rules in the grammar that begin the parse (Default: `["start"]`)
 #### open(cls, grammar_filename, rel_to=None, **options)

 * **parser** - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`)
 Creates an instance of Lark with the grammar given by its filename

 * **lexer** - Overrides default lexer, depending on parser.
 If `rel_to` is provided, the grammar file is looked up relative to it.

 * **transformer** - Applies the provided transformer instead of building a parse tree (only allowed with parser="lalr")
 Example:

 * **postlex** - Lexer post-processing (Default: `None`. only works when lexer is "standard" or "contextual")
 ```python
    >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
    Lark(...)
 ```

 * **ambiguity** (only relevant for earley and cyk)
 #### parse(self, text)

     * "explicit" - Return all derivations inside an "_ambig" data node.
 Return a complete parse tree for the text (of type Tree)

     * "resolve" - Let the parser choose the best derivation (greedy for tokens, non-greedy for rules. Default)
 If a transformer is supplied to `__init__`, returns whatever is the result of the transformation.

 * **debug** - Display warnings (such as Shift-Reduce warnings for LALR)

 * **keep_all_tokens** - Don't throw away any terminals from the tree (Default=`False`)
 #### save(self, f) / load(cls, f)

 * **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`False`)
 Useful for caching and multiprocessing.

 * **maybe_placeholders** - When True, the `[]` operator returns `None` when not matched. When `False`,  `[]` behaves like the `?` operator, and return no value at all, which may be a little faster (default=`False`)
 `save` saves the instance into the given file object

 * **lexer_callbacks** - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information.
 `load` loads an instance from the given file object

 #### parse(self, text)
 ####

 Return a complete parse tree for the text (of type Tree)

 If a transformer is supplied to `__init__`, returns whatever is the result of the transformation.
 ### Lark Options
 #### General options

 **start** - The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start")

 **debug** - Display debug information, such as warnings (default: False)

 **transformer** - Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)

 **propagate_positions** - Propagates (line, column, end_line, end_column) attributes into all tree branches.

 **maybe_placeholders** -
 - When True, the `[]` operator returns `None` when not matched.
 - When `False`,  `[]` behaves like the `?` operator, and returns no value at all.
 - (default=`False`. Recommended to set to `True`)

 **g_regex_flags** - Flags that are applied to all terminals (both regex and strings)

 **keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False)

 **cache_grammar** - Cache the Lark grammar (Default: False)

 #### Algorithm

 **parser** - Decides which parser engine to use, "earley" or "lalr". (Default: "earley")
            (there is also a "cyk" option for legacy)

 **lexer** - Decides whether or not to use a lexer stage

 - "auto" (default): Choose for me based on the parser
 - "standard": Use a standard lexer
 - "contextual": Stronger lexer (only works with parser="lalr")
 - "dynamic": Flexible and powerful (only with parser="earley")
 - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. (only with parser="earley")

 **ambiguity** - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
 - "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules)
 - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).

 #### Domain Specific

 - **postlex** - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
 - **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
 - **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
 - **edit_terminals** - A callback

 ----

--- a/lark/common.py
+++ b/lark/common.py
@@ -4,14 +4,15 @@ from .lexer import TerminalDef
 ###{standalone

 class LexerConf(Serialize):
    __serialize_fields__ = 'tokens', 'ignore'
    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
    __serialize_namespace__ = TerminalDef,

    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0):
        self.tokens = tokens
        self.ignore = ignore
        self.postlex = postlex
        self.callbacks = callbacks or {}
        self.g_regex_flags = g_regex_flags

    def _deserialize(self):
        self.callbacks = {} # TODO
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -22,32 +22,56 @@ class LarkOptions(Serialize):

    """
    OPTIONS_DOC = """
        parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley")
                 Note: "lalr" requires a lexer

        lexer - Decides whether or not to use a lexer stage
            "standard": Use a standard lexer
            "contextual": Stronger lexer (only works with parser="lalr")
            "dynamic": Flexible and powerful (only with parser="earley")
            "dynamic_complete": Same as dynamic, but tries *every* variation
                                of tokenizing possible. (only with parser="earley")
            "auto" (default): Choose for me based on grammar and parser

        ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
            "resolve": The parser will automatically choose the simplest derivation
                       (it chooses consistently: greedy for tokens, non-greedy for rules)
            "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).

        transformer - Applies the transformer to every parse tree
        debug - Affects verbosity (default: False)
        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
        cache_grammar - Cache the Lark grammar (Default: False)
        postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
        start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start")
        priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
        propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
        lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
        maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None
 # General

    start - The start symbol. Either a string, or a list of strings for
            multiple possible starts (Default: "start")
    debug - Display debug information, such as warnings (default: False)
    transformer - Applies the transformer to every parse tree (equivlent to
                  applying it after the parse, but faster)
    propagate_positions - Propagates (line, column, end_line, end_column)
                          attributes into all tree branches.
    maybe_placeholders - When True, the `[]` operator returns `None` when not matched.
                         When `False`,  `[]` behaves like the `?` operator,
                             and returns no value at all.
                         (default=`False`. Recommended to set to `True`)
    cache_grammar - Cache the Lark grammar (Default: False)
    g_regex_flags - Flags that are applied to all terminals
                    (both regex and strings)
    keep_all_tokens - Prevent the tree builder from automagically
                      removing "punctuation" tokens (default: False)

 # Algorithm

    parser - Decides which parser engine to use
             Accepts "earley" or "lalr". (Default: "earley")
             (there is also a "cyk" option for legacy)

    lexer - Decides whether or not to use a lexer stage
        "auto" (default): Choose for me based on the parser
        "standard": Use a standard lexer
        "contextual": Stronger lexer (only works with parser="lalr")
        "dynamic": Flexible and powerful (only with parser="earley")
        "dynamic_complete": Same as dynamic, but tries *every* variation
                            of tokenizing possible.

    ambiguity - Decides how to handle ambiguity in the parse.
                Only relevant if parser="earley"
        "resolve": The parser will automatically choose the simplest
                    derivation (it chooses consistently: greedy for
                    tokens, non-greedy for rules)
        "explicit": The parser will return all derivations wrapped
                    in "_ambig" tree nodes (i.e. a forest).

 # Domain Specific

    postlex - Lexer post-processing (Default: None) Only works with the
                standard and contextual lexers.
    priority - How priorities should be evaluated - auto, none, normal,
                invert (Default: auto)
    lexer_callbacks - Dictionary of callbacks for the lexer. May alter
                        tokens during lexing. Use with caution.
    edit_terminals - A callback
    """
    if __doc__:
        __doc__ += OPTIONS_DOC
@@ -68,6 +92,7 @@ class LarkOptions(Serialize):
        'lexer_callbacks': {},
        'maybe_placeholders': False,
        'edit_terminals': None,
        'g_regex_flags': 0,
    }

    def __init__(self, options_dict):
@@ -209,7 +234,7 @@ class Lark(Serialize):
                if hasattr(t, term.name):
                    lexer_callbacks[term.name] = getattr(t, term.name)

        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks)
        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)

        if self.options.parser:
            self.parser = self._build_parser()
@@ -217,12 +242,12 @@ class Lark(Serialize):
            self.lexer = self._build_lexer()

    if __init__.__doc__:
        __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
        __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC

    __serialize_fields__ = 'parser', 'rules', 'options'

    def _build_lexer(self):
        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

    def _prepare_callbacks(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -230,7 +230,7 @@ class CallChain:



 def _create_unless(terminals):
 def _create_unless(terminals, g_regex_flags):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
@@ -241,19 +241,19 @@ def _create_unless(terminals):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s)
            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))
            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))

    terminals = [t for t in terminals if t not in embedded_strs]
    return terminals, callback


 def _build_mres(terminals, max_size, match_whole):
 def _build_mres(terminals, max_size, g_regex_flags, match_whole):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries less and less groups until it's successful.
@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, match_whole):
    mres = []
    while terminals:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]))
            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(terminals, max_size//2, match_whole)
            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)

        # terms_from_name = {t.name: t for t in terminals[:max_size]}
        mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
        terminals = terminals[max_size:]
    return mres

 def build_mres(terminals, match_whole=False):
    return _build_mres(terminals, len(terminals), match_whole)
 def build_mres(terminals, g_regex_flags, match_whole=False):
    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)

 def _regexp_has_newline(r):
    r"""Expressions that may indicate newlines in a regexp:
@@ -294,7 +294,7 @@ class Lexer(object):

 class TraditionalLexer(Lexer):

    def __init__(self, terminals, ignore=(), user_callbacks={}):
    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        terminals = list(terminals)
@@ -302,7 +302,7 @@ class TraditionalLexer(Lexer):
        # Sanitization
        for t in terminals:
            try:
                re.compile(t.pattern.to_regexp())
                re.compile(t.pattern.to_regexp(), g_regex_flags)
            except re.error:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

@@ -318,10 +318,10 @@ class TraditionalLexer(Lexer):
        terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
        self.terminals = terminals
        self.user_callbacks = user_callbacks
        self.build()
        self.build(g_regex_flags)

    def build(self):
        terminals, self.callback = _create_unless(self.terminals)
    def build(self, g_regex_flags=0):
        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
@@ -331,7 +331,7 @@ class TraditionalLexer(Lexer):
            else:
                self.callback[type_] = f

        self.mres = build_mres(terminals)
        self.mres = build_mres(terminals, g_regex_flags)

    def match(self, stream, pos):
        for mre, type_from_index in self.mres:
@@ -347,7 +347,7 @@ class TraditionalLexer(Lexer):

 class ContextualLexer(Lexer):

    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
        tokens_by_name = {}
        for t in terminals:
            assert t.name not in tokens_by_name, t
@@ -362,12 +362,12 @@ class ContextualLexer(Lexer):
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)
        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)

    def lex(self, stream, get_parser_state):
        parser_state = get_parser_state()
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -88,7 +88,7 @@ class WithLexer(_ParserFrontend):
        return self._parse(token_stream, start)

    def init_traditional_lexer(self):
        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

 class LALR_WithLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
@@ -112,7 +112,8 @@ class LALR_ContextualLexer(LALR_WithLexer):
        self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
                                     ignore=self.lexer_conf.ignore,
                                     always_accept=always_accept,
                                     user_callbacks=self.lexer_conf.callbacks)
                                     user_callbacks=self.lexer_conf.callbacks,
                                     g_regex_flags=self.lexer_conf.g_regex_flags)


    def parse(self, text, start=None):
@@ -187,7 +188,7 @@ class XEarley(_ParserFrontend):
                if width == 0:
                    raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

            self.regexps[t.name] = re.compile(regexp)
            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)

    def parse(self, text, start):
        return self._parse(text, start)
--- a/lark_stubs/lark.pyi
+++ b/lark_stubs/lark.pyi
@@ -34,6 +34,7 @@ class LarkOptions:
    maybe_placeholders: bool
    lexer_callbacks: Dict[str, Callable[[Token], Token]]
    cache_grammar: bool
    g_regex_flags: int


 class Lark:
@@ -56,7 +57,8 @@ class Lark:
        keep_all_tokens: bool = False,
        propagate_positions: bool = False,
        maybe_placeholders: bool = False,
        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None
        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
        g_regex_flags: int = ...
    ):
        ...

--- a/lark_stubs/lexer.pyi
+++ b/lark_stubs/lexer.pyi
@@ -112,7 +112,8 @@ class TraditionalLexer(Lexer):
        self,
        terminals: Collection[TerminalDef],
        ignore: Collection[str] = ...,
        user_callbacks: Dict[str, _Callback] = ...
        user_callbacks: Dict[str, _Callback] = ...,
        g_regex_flags: int = ...
    ):
        ...

@@ -136,7 +137,8 @@ class ContextualLexer(Lexer):
        states: Dict[str, Collection[str]],
        ignore: Collection[str] = ...,
        always_accept: Collection[str] = ...,
        user_callbacks: Dict[str, _Callback] = ...
        user_callbacks: Dict[str, _Callback] = ...,
        g_regex_flags: int = ...
    ):
        ...

--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

 import re
 import unittest
 import logging
 import os
@@ -538,7 +539,7 @@ class CustomLexer(Lexer):
    so it uses the traditionalparser as implementation without custom lexing behaviour.
    """
    def __init__(self, lexer_conf):
        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
    def lex(self, *args, **kwargs):
        return self.lexer.lex(*args, **kwargs)

@@ -845,7 +846,16 @@ def _make_parser_test(LEXER, PARSER):

            x = g.parse("starts")
            self.assertSequenceEqual(x.children, ['starts'])

        
        def test_g_regex_flags(self):
            g = _Lark("""
                    start: "a" /b+/ C
                    C: "C" | D
                    D: "D" E
                    E: "e"
                    """, g_regex_flags=re.I)
            x1 = g.parse("ABBc")
            x2 = g.parse("abdE")

        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+
@@ -1715,6 +1725,7 @@ def _make_parser_test(LEXER, PARSER):

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
    _TestParser.__qualname__ = "tests.test_parser." + _NAME
    globals()[_NAME] = _TestParser

 # Note: You still have to import them in __main__ for the tests to run