diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index f538dbb..79437ff 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -2,7 +2,7 @@
 from typing import (
     TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
-    Literal, Protocol, Iterable,
+    Literal, Protocol, Tuple, Iterable,
 )
 from .visitors import Transformer
 from .lexer import Token, Lexer, TerminalDef
@@ -34,11 +34,25 @@ class LarkOptions:
     cache: Union[bool, str]
     g_regex_flags: int
     use_bytes: bool
+    import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]
+    source_path: Optional[str]
+
+
+class PackageResource(object):
+    pkg_name: str
+    path: str
+
+    def __init__(self, pkg_name: str, path: str): ...
+
+class FromPackageLoader:
+    def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
+
+    def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ...
 
 
 class Lark:
-    source: str
-    grammar_source: str
+    source_path: str
+    source_grammar: str
     options: LarkOptions
     lexer: Lexer
     terminals: List[TerminalDef]
@@ -62,6 +76,8 @@ class Lark:
         cache: Union[bool, str] = False,
         g_regex_flags: int = ...,
         use_bytes: bool = False,
+        import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ...,
+        source_path: Optional[str] = None,
     ):
         ...
 
@@ -71,6 +87,10 @@ class Lark:
     @classmethod
     def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
         ...
+
+    @classmethod
+    def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
+        ...
 
     def lex(self, text: str) -> Iterator[Token]:
         ...
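
Note: per the stub above, entries in `import_paths` are either plain directory
strings or loader callables of shape `(Union[None, str, PackageResource], str)
-> Tuple[str, str]`. A minimal sketch of a conforming custom loader follows;
the in-memory table and the `memory_loader` name are illustrative, not part of
this patch:

    GRAMMARS = {'ab.lark': 'startab: "a" "b"\n'}  # hypothetical in-memory store

    def memory_loader(base_path, grammar_path):
        # Raising IOError tells the grammar loader to fall through to the
        # next entry in import_paths (FileNotFoundError is Python 3 only).
        try:
            return grammar_path, GRAMMARS[grammar_path]
        except KeyError:
            raise IOError()
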
diff --git a/lark/lark.py b/lark/lark.py
index e84befc..642e408 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -4,10 +4,10 @@ from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
 import sys, os, pickle, hashlib
 from io import open
 import tempfile
-
+from warnings import warn
 
 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
 
-from .load_grammar import load_grammar
+from .load_grammar import load_grammar, FromPackageLoader
 from .tree import Tree
 from .common import LexerConf, ParserConf
@@ -92,6 +92,10 @@ class LarkOptions(Serialize):
         Accept an input of type ``bytes`` instead of ``str`` (Python 3 only).
     edit_terminals
         A callback for editing the terminals before parse.
+    import_paths
+        A list of either paths or loader functions, specifying where grammars are imported from.
+    source_path
+        Override the source path from which the grammar was loaded. Useful for relative imports and unconventional grammar loading.
 
     **=== End Options ===**
     """
@@ -126,6 +130,8 @@ class LarkOptions(Serialize):
         'edit_terminals': None,
         'g_regex_flags': 0,
         'use_bytes': False,
+        'import_paths': [],
+        'source_path': None,
     }
 
     def __init__(self, options_dict):
@@ -209,10 +215,13 @@ class Lark(Serialize):
             re_module = re
 
         # Some, but not all file-like objects have a 'name' attribute
-        try:
-            self.source = grammar.name
-        except AttributeError:
-            self.source = '<string>'
+        if self.options.source_path is None:
+            try:
+                self.source_path = grammar.name
+            except AttributeError:
+                self.source_path = '<string>'
+        else:
+            self.source_path = self.options.source_path
 
         # Drain file-like objects to get their contents
         try:
@@ -223,7 +232,7 @@
             grammar = read()
 
         assert isinstance(grammar, STRING_TYPE)
-        self.grammar_source = grammar
+        self.source_grammar = grammar
         if self.options.use_bytes:
             if not isascii(grammar):
                 raise ValueError("Grammar must be ascii only, when use_bytes=True")
@@ -286,7 +295,7 @@
                 raise ValueError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))
 
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source, re_module, self.options.keep_all_tokens)
+        self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
 
         if self.options.postlex is not None:
             terminals_to_keep = set(self.options.postlex.always_accept)
@@ -395,7 +404,7 @@
         options.update(kwargs)
         self.options = LarkOptions.deserialize(options, memo)
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
-        self.source = '<deserialized>'
+        self.source_path = '<deserialized>'
         self._prepare_callbacks()
         self.parser = self.parser_class.deserialize(
             data['parser'],
@@ -430,8 +439,26 @@
         with open(grammar_filename, encoding='utf8') as f:
             return cls(f, **options)
 
+    @classmethod
+    def open_from_package(cls, package, grammar_path, search_paths=("",), **options):
+        """Create an instance of Lark with the grammar loaded from within the package `package`.
+        This allows grammar loading from zipapps.
+
+        Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`.
+
+        Example:
+
+            Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
+        """
+        package = FromPackageLoader(package, search_paths)
+        full_path, text = package(None, grammar_path)
+        options.setdefault('source_path', full_path)
+        options.setdefault('import_paths', [])
+        options['import_paths'].append(package)
+        return cls(text, **options)
+
     def __repr__(self):
-        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
+        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)
 
 
     def lex(self, text):
@@ -491,5 +518,23 @@
         except UnexpectedCharacters as e2:
             e = e2
 
+    @property
+    def source(self):
+        warn("Lark.source attribute has been renamed to Lark.source_path", DeprecationWarning)
+        return self.source_path
+
+    @source.setter
+    def source(self, value):
+        self.source_path = value
+
+    @property
+    def grammar_source(self):
+        warn("Lark.grammar_source attribute has been renamed to Lark.source_grammar", DeprecationWarning)
+        return self.source_grammar
+
+    @grammar_source.setter
+    def grammar_source(self, value):
+        self.source_grammar = value
+
 
 ###}
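
Note: a usage sketch for the new constructor, assuming a hypothetical package
`my_pkg` that ships `my_pkg/grammars/example.lark` as package data. Because
loading goes through `pkgutil.get_data`, this also works when `my_pkg` lives
inside a zipapp; the old `source`/`grammar_source` attributes keep working but
emit a DeprecationWarning:

    from lark import Lark

    parser = Lark.open_from_package('my_pkg', 'example.lark', ('grammars',), parser='lalr')
    print(parser.source_path)  # a PackageResource: <my_pkg: grammars/example.lark>
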
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index d039638..eb0273c 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -4,6 +4,7 @@ import os.path
 import sys
 from copy import copy, deepcopy
 from io import open
+import pkgutil
 
 from .utils import bfs, eval_escaping, Py36, logger, classify_bool
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -20,7 +21,7 @@ from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive
 inline_args = v_args(inline=True)
 
 __path__ = os.path.dirname(__file__)
-IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
+IMPORT_PATHS = ['grammars']
 
 EXT = '.lark'
@@ -648,6 +649,58 @@ class Grammar:
         return terminals, compiled_rules, self.ignore
 
 
+class PackageResource(object):
+    """
+    Represents a path inside a package. Used by `FromPackageLoader`.
+    """
+    def __init__(self, pkg_name, path):
+        self.pkg_name = pkg_name
+        self.path = path
+
+    def __str__(self):
+        return "<%s: %s>" % (self.pkg_name, self.path)
+
+    def __repr__(self):
+        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path)
+
+class FromPackageLoader(object):
+    """
+    Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of ``open``.
+    This allows them to be used even from within zip files.
+
+    Relative imports are handled, so you can just freely use them.
+
+    pkg_name: The name of the package. You can probably provide `__name__` most of the time.
+    search_paths: All the paths that will be searched on absolute imports.
+    """
+    def __init__(self, pkg_name, search_paths=("", )):
+        self.pkg_name = pkg_name
+        self.search_paths = search_paths
+
+    def __repr__(self):
+        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths)
+
+    def __call__(self, base_path, grammar_path):
+        if base_path is None:
+            to_try = self.search_paths
+        else:
+            # Check whether or not the importing grammar was loaded by this loader.
+            if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name:
+                # Technically false, but FileNotFoundError doesn't exist in Python 2.7, and this message should never reach the end user anyway.
+                raise IOError()
+            to_try = [base_path.path]
+        for path in to_try:
+            full_path = os.path.join(path, grammar_path)
+            try:
+                text = pkgutil.get_data(self.pkg_name, full_path)
+            except IOError:
+                continue
+            else:
+                return PackageResource(self.pkg_name, full_path), text.decode()
+        raise IOError()
+
+stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)
+
 _imported_grammars = {}
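
Note: since the bundled grammars now resolve through the same loader protocol,
`stdlib_loader` itself demonstrates the call contract (POSIX path separators
assumed in the comments):

    from lark.load_grammar import stdlib_loader

    # (base_path, grammar_path) -> (PackageResource, text)
    resource, text = stdlib_loader(None, 'common.lark')
    print(resource)  # <lark: grammars/common.lark>
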
@@ -787,39 +840,47 @@ class GrammarLoader:
         ('%ignore expects a value', ['%ignore %import\n']),
     ]
 
-    def __init__(self, re_module, global_keep_all_tokens):
+    def __init__(self, global_keep_all_tokens):
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
         rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
         callback = ParseTreeBuilder(rules, ST).create_callback()
-        lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])
+        import re
+        lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, ['start'])
 
         self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
 
         self.canonize_tree = CanonizeTree()
-        self.re_module = re_module
         self.global_keep_all_tokens = global_keep_all_tokens
 
-    def import_grammar(self, grammar_path, base_paths=[]):
+    def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
         if grammar_path not in _imported_grammars:
-            import_paths = base_paths + IMPORT_PATHS
-            for import_path in import_paths:
-                with suppress(IOError):
-                    joined_path = os.path.join(import_path, grammar_path)
-                    with open(joined_path, encoding='utf8') as f:
-                        text = f.read()
-                    grammar = self.load_grammar(text, joined_path)
+            # import_paths take priority over base_path, since they should handle relative imports and ignore everything else.
+            to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
+            for source in to_try:
+                try:
+                    if callable(source):
+                        joined_path, text = source(base_path, grammar_path)
+                    else:
+                        joined_path = os.path.join(source, grammar_path)
+                        with open(joined_path, encoding='utf8') as f:
+                            text = f.read()
+                except IOError:
+                    continue
+                else:
+                    grammar = self.load_grammar(text, joined_path, import_paths)
                     _imported_grammars[grammar_path] = grammar
                     break
             else:
-                open(grammar_path, encoding='utf8')  # Force a file not found error
+                # Search failed. Make Python throw a nice error.
+                open(grammar_path, encoding='utf8')
                 assert False
 
         return _imported_grammars[grammar_path]
 
-    def load_grammar(self, grammar_text, grammar_name='<?>'):
+    def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
 
         try:
@@ -873,7 +934,7 @@ class GrammarLoader:
             aliases = {name: arg1 or name}  # Aliases if exist
 
             if path_node.data == 'import_lib':  # Import from library
-                base_paths = []
+                base_path = None
             else:  # Relative import
                 if grammar_name == '<string>':  # Import relative to script file path if grammar is coded in script
                     try:
                         base_file = os.path.abspath(sys.modules['__main__'].__file__)
                     except AttributeError:
                         base_file = None
                 else:
                     base_file = grammar_name  # Import relative to grammar file path if external grammar file
                 if base_file:
-                    base_paths = [os.path.split(base_file)[0]]
+                    if isinstance(base_file, PackageResource):
+                        base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
+                    else:
+                        base_path = os.path.split(base_file)[0]
                 else:
-                    base_paths = [os.path.abspath(os.path.curdir)]
+                    base_path = os.path.abspath(os.path.curdir)
 
             try:
-                import_base_paths, import_aliases = imports[dotted_path]
-                assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
+                import_base_path, import_aliases = imports[dotted_path]
+                assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
                 import_aliases.update(aliases)
             except KeyError:
-                imports[dotted_path] = base_paths, aliases
+                imports[dotted_path] = base_path, aliases
 
         elif stmt.data == 'declare':
             for t in stmt.children:
@@ -901,9 +965,9 @@ class GrammarLoader:
             assert False, stmt
 
         # import grammars
-        for dotted_path, (base_paths, aliases) in imports.items():
+        for dotted_path, (base_path, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = self.import_grammar(grammar_path, base_paths=base_paths)
+            g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td
@@ -987,5 +1051,5 @@
 
 
-def load_grammar(grammar, source, re_, global_keep_all_tokens):
-    return GrammarLoader(re_, global_keep_all_tokens).load_grammar(grammar, source)
+def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
+    return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
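
Note: the reworked `import_grammar` tries sources in a fixed order: the
user-supplied `import_paths` (strings or loader callables), then the importing
grammar's own `base_path`, then `stdlib_loader`. A sketch mixing both kinds of
entries; the directory and package names are placeholders:

    from lark import Lark
    from lark.load_grammar import FromPackageLoader

    parser = Lark.open('my_grammar.lark', import_paths=[
        '/opt/shared/grammars',                      # plain directory, tried first
        FromPackageLoader('my_pkg', ('grammars',)),  # then package data
    ])
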
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 38399cf..32aa4fc 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -11,6 +11,7 @@ from copy import copy, deepcopy
 
 from lark.utils import Py36, isascii
 from lark import Token
+from lark.load_grammar import FromPackageLoader
 
 try:
     from cStringIO import StringIO as cStringIO
@@ -1805,6 +1806,37 @@ def _make_parser_test(LEXER, PARSER):
             tree = parser.parse(test_file)
             self.assertEqual(tree.children, [Token('B', 'A')])
 
+        def test_import_custom_sources(self):
+            custom_loader = FromPackageLoader('tests', ('grammars', ))
+
+            grammar = """
+            start: startab
+
+            %import ab.startab
+            """
+
+            p = _Lark(grammar, import_paths=[custom_loader])
+            self.assertEqual(p.parse('ab'),
+                             Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))
+
+            grammar = """
+            start: rule_to_import
+
+            %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
+            """
+            p = _Lark(grammar, import_paths=[custom_loader])
+            x = p.parse('N')
+            self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])
+
+            custom_loader2 = FromPackageLoader('tests')
+            grammar = """
+            %import .test_relative_import (start, WS)
+            %ignore WS
+            """
+            p = _Lark(grammar, import_paths=[custom_loader2])
+            x = p.parse('12 capybaras')
+            self.assertEqual(x.children, ['12', 'capybaras'])
+
         @unittest.skipIf(PARSER == 'cyk', "Doesn't work for CYK")
         def test_prioritization(self):
             "Tests effect of priority on result"
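
Note: the first of the new tests corresponds to this stand-alone snippet,
assuming (as in lark's own test suite) an importable package `tests` whose
`grammars/ab.lark` defines `startab`:

    from lark import Lark
    from lark.load_grammar import FromPackageLoader

    loader = FromPackageLoader('tests', ('grammars',))
    p = Lark('start: startab\n%import ab.startab', import_paths=[loader])
    print(p.parse('ab').pretty())
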