Browse Files

Merge branch 'MegaIng-custom_import_sources'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.0
Erez Sh 3 years ago
parent
commit
3aab72fffd
7 changed files with 233 additions and 52 deletions
1. +5 -13 examples/advanced/python3.lark
2. +23 -3 lark-stubs/lark.pyi
3. +10 -1 lark/grammars/common.lark
4. +19 -0 lark/grammars/python.lark
5. +56 -11 lark/lark.py
6. +88 -24 lark/load_grammar.py
7. +32 -0 tests/test_parser.py

+5 -13 examples/advanced/python3.lark (View File)

@@ -163,22 +163,14 @@ yield_arg: "from" test | testlist

number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
string: STRING | LONG_STRING
// Tokens

NAME: /[a-zA-Z_]\w*/
COMMENT: /#[^\n]*/
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+

// Import terminals from standard library (grammars/python.lark)
%import python (NAME, COMMENT, STRING, LONG_STRING)
%import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER)

STRING : /[ubf]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
LONG_STRING: /[ubf]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is
// Other terminals

DEC_NUMBER: /0|[1-9]\d*/i
HEX_NUMBER.2: /0x[\da-f]*/i
OCT_NUMBER.2: /0o[0-7]*/i
BIN_NUMBER.2 : /0b[0-1]*/i
FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i
IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+

%ignore /[\t \f]+/ // WS
%ignore /\\[\t \f]*\r?\n/ // LINE_CONT
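
With these terminals moved into lark/grammars/python.lark, other grammars can import them from the standard library instead of redefining them. A minimal sketch of the new capability (illustrative grammar, not part of the commit; assumes a lark version containing this change):

from lark import Lark

# NAME resolves against the new lark/grammars/python.lark through the
# bundled stdlib loader; WS comes from common.lark as before.
parser = Lark(r"""
    start: NAME+
    %import python (NAME)
    %import common (WS)
    %ignore WS
""", parser="lalr")

print(parser.parse("foo bar").children)  # [Token('NAME', 'foo'), Token('NAME', 'bar')]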


+23 -3 lark-stubs/lark.pyi (View File)

@@ -2,7 +2,7 @@

from typing import (
TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
Literal, Protocol, Iterable,
Literal, Protocol, Tuple, Iterable,
)
from .visitors import Transformer
from .lexer import Token, Lexer, TerminalDef
@@ -34,11 +34,25 @@ class LarkOptions:
cache: Union[bool, str]
g_regex_flags: int
use_bytes: bool
import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]
source_path: Optional[str]


class PackageResource(object):
pkg_name: str
path: str
def __init__(self, pkg_name: str, path: str): ...

class FromPackageLoader:
def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ...


class Lark:
source: str
grammar_source: str
source_path: str
source_grammar: str
options: LarkOptions
lexer: Lexer
terminals: List[TerminalDef]
@@ -62,6 +76,8 @@ class Lark:
cache: Union[bool, str] = False,
g_regex_flags: int = ...,
use_bytes: bool = False,
import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ...,
source_path: Optional[str]=None,
):
...

@@ -71,6 +87,10 @@ class Lark:
@classmethod
def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
...
@classmethod
def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
...

def lex(self, text: str) -> Iterator[Token]:
...


+10 -1 lark/grammars/common.lark (View File)

@@ -1,3 +1,6 @@
// Basic terminals for common use


//
// Numbers
//
@@ -21,7 +24,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
// Strings
//
_STRING_INNER: /.*?/
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/

ESCAPED_STRING : "\"" _STRING_ESC_INNER "\""

@@ -48,3 +51,9 @@ CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+


// Comments
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /.*?/s "*/"
SQL_COMMENT: /--[^\n]*/
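
The new comment terminals can be imported and ignored like any other stdlib terminal. A small sketch (illustrative grammar, not from the commit):

from lark import Lark

# SH_COMMENT is one of the comment terminals added to common.lark above.
parser = Lark(r"""
    start: NUMBER+
    %import common (NUMBER, SH_COMMENT, WS)
    %ignore SH_COMMENT
    %ignore WS
""")

print(parser.parse("1 2  # a shell-style comment\n3").children)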

+19 -0 lark/grammars/python.lark (View File)

@@ -0,0 +1,19 @@
// Python terminals

NAME: /[a-zA-Z_]\w*/
COMMENT: /#[^\n]*/

STRING : /[ubf]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
LONG_STRING: /[ubf]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is

DEC_NUMBER: /0|[1-9]\d*/i
HEX_NUMBER.2: /0x[\da-f]*/i
OCT_NUMBER.2: /0o[0-7]*/i
BIN_NUMBER.2 : /0b[0-1]*/i
FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i
IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i


// Comma-separated list (with an optional trailing comma)
cs_list{item}: item ("," item)* ","?
_cs_list{item}: item ("," item)* ","?

+56 -11 lark/lark.py (View File)

@@ -4,10 +4,10 @@ from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedTok
import sys, os, pickle, hashlib
from io import open
import tempfile
from warnings import warn

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar
from .load_grammar import load_grammar, FromPackageLoader
from .tree import Tree
from .common import LexerConf, ParserConf

@@ -92,6 +92,10 @@ class LarkOptions(Serialize):
Accept an input of type ``bytes`` instead of ``str`` (Python 3 only).
edit_terminals
A callback for editing the terminals before parse.
import_paths
A list of either paths or loader functions, specifying where grammars are imported from
source_path
Override the source path the grammar was loaded from. Useful for relative imports and unconventional grammar loading

**=== End Options ===**
"""
@@ -126,6 +130,8 @@ class LarkOptions(Serialize):
'edit_terminals': None,
'g_regex_flags': 0,
'use_bytes': False,
'import_paths': [],
'source_path': None,
}

def __init__(self, options_dict):
@@ -209,10 +215,13 @@ class Lark(Serialize):
re_module = re

# Some, but not all file-like objects have a 'name' attribute
try:
self.source = grammar.name
except AttributeError:
self.source = '<string>'
if self.options.source_path is None:
try:
self.source_path = grammar.name
except AttributeError:
self.source_path = '<string>'
else:
self.source_path = self.options.source_path

# Drain file-like objects to get their contents
try:
@@ -223,7 +232,7 @@ class Lark(Serialize):
grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.grammar_source = grammar
self.source_grammar = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ValueError("Grammar must be ascii only, when use_bytes=True")
@@ -285,8 +294,8 @@ class Lark(Serialize):
if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
raise ValueError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

# Parse the grammar file and compose the grammars (TODO)
self.grammar = load_grammar(grammar, self.source, re_module, self.options.keep_all_tokens)
# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)

if self.options.postlex is not None:
terminals_to_keep = set(self.options.postlex.always_accept)
@@ -395,7 +404,7 @@ class Lark(Serialize):
options.update(kwargs)
self.options = LarkOptions.deserialize(options, memo)
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source = '<deserialized>'
self.source_path = '<deserialized>'
self._prepare_callbacks()
self.parser = self.parser_class.deserialize(
data['parser'],
@@ -430,8 +439,26 @@ class Lark(Serialize):
with open(grammar_filename, encoding='utf8') as f:
return cls(f, **options)

@classmethod
def open_from_package(cls, package, grammar_path, search_paths=("",), **options):
"""Create an instance of Lark with the grammar loaded from within the package `package`.
This allows grammar loading from zipapps.

Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`.

Example:

Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
"""
package = FromPackageLoader(package, search_paths)
full_path, text = package(None, grammar_path)
options.setdefault('source_path', full_path)
options.setdefault('import_paths', [])
options['import_paths'].append(package)
return cls(text, **options)

def __repr__(self):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)


def lex(self, text):
@@ -491,5 +518,23 @@ class Lark(Serialize):
except UnexpectedCharacters as e2:
e = e2

@property
def source(self):
warn("Lark.source attribute has been renamed to Lark.source_path", DeprecationWarning)
return self.source_path

@source.setter
def source(self, value):
self.source_path = value

@property
def grammar_source(self):
warn("Lark.grammar_source attribute has been renamed to Lark.source_grammar", DeprecationWarning)
return self.source_grammar

@grammar_source.setter
def grammar_source(self, value):
self.source_grammar = value


###}
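
Taken together, the new classmethod makes zipapp-safe grammar loading a one-liner. A sketch of the intended usage (``my_pkg`` and ``example.lark`` are placeholder names, not from the commit):

from lark import Lark

# Hypothetical layout: my_pkg/grammars/example.lark
# Grammar data is read via pkgutil.get_data, so this keeps working when
# my_pkg ships inside a zipapp or wheel.
parser = Lark.open_from_package("my_pkg", "example.lark", ("grammars",), parser="lalr")

# The old attribute names still work but now emit a DeprecationWarning:
print(parser.source_path)   # new name
print(parser.source)        # deprecated alias, warns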

+88 -24 lark/load_grammar.py (View File)

@@ -4,6 +4,7 @@ import os.path
import sys
from copy import copy, deepcopy
from io import open
import pkgutil

from .utils import bfs, eval_escaping, Py36, logger, classify_bool
from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -20,7 +21,7 @@ from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transfo
inline_args = v_args(inline=True)

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
IMPORT_PATHS = ['grammars']

EXT = '.lark'

@@ -648,6 +649,58 @@ class Grammar:
return terminals, compiled_rules, self.ignore


class PackageResource(object):
"""
Represents a path inside a Package. Used by `FromPackageLoader`
"""
def __init__(self, pkg_name, path):
self.pkg_name = pkg_name
self.path = path

def __str__(self):
return "<%s: %s>" % (self.pkg_name, self.path)

def __repr__(self):
return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path)

class FromPackageLoader(object):
"""
Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`.
This allows them to be compatible even from within zip files.

Relative imports are handled, so you can just freely use them.

pkg_name: The name of the package. You can usually just pass ``__name__``.
search_paths: All the paths that will be searched on absolute imports.
"""
def __init__(self, pkg_name, search_paths=("", )):
self.pkg_name = pkg_name
self.search_paths = search_paths

def __repr__(self):
return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths)

def __call__(self, base_path, grammar_path):
if base_path is None:
to_try = self.search_paths
else:
# Check whether or not the importing grammar was loaded by this module.
if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name:
# Technically false, but FileNotFoundError doesn't exist in Python 2.7, and this message should never reach the end user anyway
raise IOError()
to_try = [base_path.path]
for path in to_try:
full_path = os.path.join(path, grammar_path)
try:
text = pkgutil.get_data(self.pkg_name, full_path)
except IOError:
continue
else:
return PackageResource(self.pkg_name, full_path), text.decode()
raise IOError()

stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)


_imported_grammars = {}

@@ -787,39 +840,47 @@ class GrammarLoader:
('%ignore expects a value', ['%ignore %import\n']),
]

def __init__(self, re_module, global_keep_all_tokens):
def __init__(self, global_keep_all_tokens):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

self.canonize_tree = CanonizeTree()
self.re_module = re_module
self.global_keep_all_tokens = global_keep_all_tokens

def import_grammar(self, grammar_path, base_paths=[]):
def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
if grammar_path not in _imported_grammars:
import_paths = base_paths + IMPORT_PATHS
for import_path in import_paths:
with suppress(IOError):
joined_path = os.path.join(import_path, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
grammar = self.load_grammar(text, joined_path)
# import_paths take priority over base_path since they should handle relative imports and ignore everything else.
to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
else:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
except IOError:
continue
else:
grammar = self.load_grammar(text, joined_path, import_paths)
_imported_grammars[grammar_path] = grammar
break
else:
open(grammar_path, encoding='utf8') # Force a file not found error
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False

return _imported_grammars[grammar_path]

def load_grammar(self, grammar_text, grammar_name='<?>'):
def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
"Parse grammar_text, verify, and create Grammar object. Display nice messages on error."

try:
@@ -873,7 +934,7 @@ class GrammarLoader:
aliases = {name: arg1 or name} # Aliases if exist

if path_node.data == 'import_lib': # Import from library
base_paths = []
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
try:
@@ -883,16 +944,19 @@ class GrammarLoader:
else:
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
base_paths = [os.path.split(base_file)[0]]
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
else:
base_path = os.path.split(base_file)[0]
else:
base_paths = [os.path.abspath(os.path.curdir)]
base_path = os.path.abspath(os.path.curdir)

try:
import_base_paths, import_aliases = imports[dotted_path]
assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
import_base_path, import_aliases = imports[dotted_path]
assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
import_aliases.update(aliases)
except KeyError:
imports[dotted_path] = base_paths, aliases
imports[dotted_path] = base_path, aliases

elif stmt.data == 'declare':
for t in stmt.children:
@@ -901,9 +965,9 @@ class GrammarLoader:
assert False, stmt

# import grammars
for dotted_path, (base_paths, aliases) in imports.items():
for dotted_path, (base_path, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = self.import_grammar(grammar_path, base_paths=base_paths)
g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td
@@ -987,5 +1051,5 @@ class GrammarLoader:



def load_grammar(grammar, source, re_, global_keep_all_tokens):
return GrammarLoader(re_, global_keep_all_tokens).load_grammar(grammar, source)
def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
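
Note that a custom import source does not have to be a FromPackageLoader: import_grammar accepts any callable with the (base_path, grammar_path) -> (path, text) signature that raises IOError on a miss. A minimal sketch with a hypothetical in-memory loader (names and grammar text are illustrative):

from lark import Lark

# Hypothetical in-memory grammar store; keys are the paths lark asks for.
GRAMMARS = {
    'ab.lark': 'startab: "a" "b"\n',
}

def memory_loader(base_path, grammar_path):
    # base_path is None for absolute (%import lib.rule) imports.
    try:
        return grammar_path, GRAMMARS[grammar_path]
    except KeyError:
        raise IOError()  # tells import_grammar to try the next source

parser = Lark("""
    start: startab
    %import ab.startab
""", import_paths=[memory_loader])

print(parser.parse("ab"))  # Tree('start', [Tree('startab', [])])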

+32 -0 tests/test_parser.py (View File)

@@ -11,6 +11,7 @@ from copy import copy, deepcopy
from lark.utils import Py36, isascii

from lark import Token
from lark.load_grammar import FromPackageLoader

try:
from cStringIO import StringIO as cStringIO
@@ -1805,6 +1806,37 @@ def _make_parser_test(LEXER, PARSER):
tree = parser.parse(test_file)
self.assertEqual(tree.children, [Token('B', 'A')])

def test_import_custom_sources(self):
custom_loader = FromPackageLoader('tests', ('grammars', ))

grammar = """
start: startab

%import ab.startab
"""

p = _Lark(grammar, import_paths=[custom_loader])
self.assertEqual(p.parse('ab'),
Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))

grammar = """
start: rule_to_import

%import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
"""
p = _Lark(grammar, import_paths=[custom_loader])
x = p.parse('N')
self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])

custom_loader2 = FromPackageLoader('tests')
grammar = """
%import .test_relative_import (start, WS)
%ignore WS
"""
p = _Lark(grammar, import_paths=[custom_loader2])
x = p.parse('12 capybaras')
self.assertEqual(x.children, ['12', 'capybaras'])

@unittest.skipIf(PARSER == 'cyk', "Doesn't work for CYK")
def test_prioritization(self):
"Tests effect of priority on result"

