Added `FromPackageLoader` and `open_from_package`

5 years ago · 009cc10590
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -33,6 +33,13 @@ class LarkOptions:
    g_regex_flags: int
    use_bytes: bool
    import_sources: List[Union[str, Callable[[str, str], str]]]
    source: Optional[str]


 class FromPackageLoader:
    # Import loader that reads grammars from inside a package.
    # The two attributes below are the values stored by the constructor.
    pkg_name: str
    search_paths: Tuple[str, ...]

    def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...

    # Invoked as ``loader(base_paths, grammar_path)``; the result is a
    # ``(source name, grammar text)`` pair.
    def __call__(self, base_paths: List[str], grammar_path: str) -> Tuple[str, str]: ...


 class Lark:
@@ -62,6 +69,7 @@ class Lark:
        g_regex_flags: int = ...,
        use_bytes: bool = False,
        import_sources: List[Union[str, Callable[[List[str], str], Tuple[str, str]]]] = ...,
        source: Optional[str],
    ):
        ...

@@ -71,6 +79,10 @@ class Lark:
    @classmethod
    def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
        # Alternate constructor: builds an instance from the grammar file at
        # ``grammar_filename``; extra keyword options go to the constructor.
        # NOTE(review): ``rel_to`` presumably names the path a relative
        # ``grammar_filename`` is resolved against - confirm in lark/lark.py.
        ...
    
    @classmethod
    def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
        # Alternate constructor: loads ``grammar_path`` from inside the package
        # named ``package``; ``search_paths`` is handed to ``FromPackageLoader``
        # and extra keyword options go to the constructor.
        ...

    def lex(self, text: str) -> Iterator[Token]:
        # Per the annotations: takes the input text and returns an iterator of ``Token`` objects.
        ...
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -5,7 +5,7 @@ from io import open


 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
 from .load_grammar import load_grammar
 from .load_grammar import load_grammar, FromPackageLoader
 from .tree import Tree
 from .common import LexerConf, ParserConf

@@ -92,6 +92,8 @@ class LarkOptions(Serialize):
            A callback for editing the terminals before parse.
    import_sources
            A List of either paths or loader functions to specify from where grammars are imported 
    source
            Override the source from which the grammar was loaded. Useful for relative imports and unconventional grammar loading

    **=== End Options ===**
    """
@@ -118,6 +120,7 @@ class LarkOptions(Serialize):
        'g_regex_flags': 0,
        'use_bytes': False,
        'import_sources': [],
        'source': None,
    }

    def __init__(self, options_dict):
@@ -193,10 +196,13 @@ class Lark(Serialize):
            re_module = re

        # Some, but not all file-like objects have a 'name' attribute
        try:
            self.source = grammar.name
        except AttributeError:
            self.source = '<string>'
        if self.options.source is None:
            try:
                self.source = grammar.name
            except AttributeError:
                self.source = '<string>'
        else:
            self.source = self.options.source

        # Drain file-like objects to get their contents
        try:
@@ -404,6 +410,28 @@ class Lark(Serialize):
            grammar_filename = os.path.join(basepath, grammar_filename)
        with open(grammar_filename, encoding='utf8') as f:
            return cls(f, **options)
    
    @classmethod
    def open_from_package(cls, package, grammar_path, search_paths=("",), **options):
        """Create an instance of Lark with the grammar loaded from within the package `package`.
        This allows grammar loading from zipapps.

        A `FromPackageLoader` instance is also created and added to the end of ``import_sources``,
        to simplify importing. The list passed by the caller is not modified; a new list is built.

        Parameters:
            package: Name of the package that contains the grammar.
            grammar_path: Path of the grammar, looked up by the `FromPackageLoader`.
            search_paths: Passed to `FromPackageLoader`.
            **options: Forwarded to the constructor. ``source`` defaults to the name the loader
                reports for the grammar, unless the caller supplies one.

        Raises:
            IOError: Propagated from the loader when it cannot find the grammar.

        Example:

            Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
        """
        # Use a separate name instead of rebinding the `package` argument.
        loader = FromPackageLoader(package, search_paths)
        # An empty base-path list makes the loader go through its search paths.
        full_path, text = loader([], grammar_path)
        options.setdefault('source', full_path)
        # `options` is a fresh dict, but a list stored under 'import_sources' still belongs to
        # the caller; copy it so repeated calls with the same list do not keep growing it.
        options['import_sources'] = list(options.get('import_sources', ())) + [loader]
        return cls(text, **options)

    def __repr__(self):
        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -4,6 +4,7 @@ import os.path
 import sys
 from copy import copy, deepcopy
 from io import open
 import pkgutil

 from .utils import bfs, eval_escaping, Py36, logger, classify_bool
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -648,35 +649,69 @@ class Grammar:
        return terminals, compiled_rules, self.ignore


 def stdlib_loader(base_paths, grammar_path):
    import pkgutil
    for path in IMPORT_PATHS:
        text = pkgutil.get_data('lark', path + '/' + grammar_path)
        if text is None:
            continue
        return '<stdlib:' + grammar_path + '>', text.decode()
    raise FileNotFoundError()
 class FromPackageLoader(object):
    """
    Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`.
    This allows them to be compatible even from within zip files.

    Relative imports are handled, so you can just freely use them.

    pkg_name: The name of the package. You can probably provide `__name__` most of the time
    search_paths: All the paths that will be searched on absolute imports.
    """
    def __init__(self, pkg_name, search_paths=("", )):
        self.pkg_name = pkg_name
        self.search_paths = search_paths

    def __repr__(self):
        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths)

    def __call__(self, base_paths, grammar_path):
        """Load ``grammar_path`` from the package and return ``(source_name, text)``.

        base_paths: Empty for an absolute import, in which case every entry of
            ``search_paths`` is tried in order. Otherwise it must hold exactly one
            entry of the form ``<pkg_name:...``, and only that location is tried.
        grammar_path: Path of the grammar, joined onto each candidate path.

        The returned source name has the form ``<pkg_name:/full_path>``.

        Raises IOError when the base path does not start with ``<pkg_name:`` or when
        the grammar is not found in any candidate path.
        """
        if not base_paths:
            to_try = self.search_paths
        else:
            assert len(base_paths) == 1
            if not base_paths[0].startswith('<%s:' % (self.pkg_name,)):
                # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway
                raise IOError('%r does not belong to package %r' % (base_paths[0], self.pkg_name))
            # Keep only what follows the '<pkg_name:' prefix, without a leading '/'.
            base_path = base_paths[0].partition(':')[2]
            if base_path and base_path[0] == '/':
                base_path = base_path[1:]
            to_try = [base_path]
        for path in to_try:
            full_path = os.path.join(path, grammar_path)
            try:
                text = pkgutil.get_data(self.pkg_name, full_path)
            except IOError:
                # Not available under this path; try the next candidate.
                continue
            if text is None:
                continue
            # Decode explicitly as UTF-8, the encoding this file uses when reading grammars with `open`.
            return '<%s:/%s>' % (self.pkg_name, full_path), text.decode('utf8')
        raise IOError('Cannot find grammar %r in package %r (tried paths: %r)'
                      % (grammar_path, self.pkg_name, list(to_try)))

 stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)


 _imported_grammars = {}
 def import_grammar(grammar_path, re_, base_paths=(), import_sources=()):
 def import_grammar(grammar_path, re_, base_paths=[], import_sources=[]):
    if grammar_path not in _imported_grammars:
        import_paths = import_sources + base_paths + [stdlib_loader]
        # import_sources take priority over base_paths since they should handle relative imports and ignore everything else.
        # Question: should the stdlib_loader really be pushed to the end?
        import_paths = import_sources + base_paths + [stdlib_loader] 
        for source in import_paths:
            if callable(source):
                with suppress(IOError):
            text = None
            with suppress(IOError):
                if callable(source):
                    joined_path, text = source(base_paths, grammar_path)
                    grammar = load_grammar(text, joined_path, re_, import_sources)
                    _imported_grammars[grammar_path] = grammar
                    break
            else:
                with suppress(IOError):
                else:
                    joined_path = os.path.join(source, grammar_path)
                    with open(joined_path, encoding='utf8') as f:
                        text = f.read()
                    grammar = load_grammar(text, joined_path, re_, import_sources)
                    _imported_grammars[grammar_path] = grammar
                    break
            if text is not None:
                # Don't load the grammar from within the suppress statement. Otherwise the underlying error message will be swallowed 
                # and the wrong file will be reported as missing
                grammar = load_grammar(text, joined_path, re_, import_sources) 
                _imported_grammars[grammar_path] = grammar
                break
        else:
            open(grammar_path, encoding='utf8')
            assert False
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -11,6 +11,7 @@ from copy import copy, deepcopy
 from lark.utils import Py36, isascii

 from lark import Token
 from lark.load_grammar import FromPackageLoader

 try:
    from cStringIO import StringIO as cStringIO
@@ -1783,12 +1784,7 @@ def _make_parser_test(LEXER, PARSER):
            self.assertRaises(IOError, _Lark, grammar)

        def test_import_custom_sources(self):
            def custom_loader(base_paths, grammar_path):
                import pkgutil
                text = pkgutil.get_data('tests', 'grammars/' + grammar_path)
                if text is None:
                    raise FileNotFoundError()
                return '<tests.grammars:' + grammar_path + '>', text.decode()
            custom_loader = FromPackageLoader('tests', ('grammars', ))

            grammar = """
            start: startab
@@ -1800,6 +1796,24 @@ def _make_parser_test(LEXER, PARSER):
            self.assertEqual(p.parse('ab'),
                             Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))

            grammar = """
            start: rule_to_import

            %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
            """
            p = _Lark(grammar, import_sources=[custom_loader])
            x = p.parse('N')
            self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])
            
            custom_loader2 = FromPackageLoader('tests')
            grammar = """
            %import .test_relative_import (start, WS)
            %ignore WS
            """
            p = _Lark(grammar, import_sources=[custom_loader2])
            x = p.parse('12 capybaras')
            self.assertEqual(x.children, ['12', 'capybaras'])

        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization(self):
            "Tests effect of priority on result"