Browse Source

Merge branch 'custom_import_sources' of into MegaIng-custom_import_sources

Resolved conflicts manually /erez
Erez Sh 4 years ago
4 changed files with 198 additions and 37 deletions
  1. +23
  2. +55
  3. +88
  4. +32

+ 23
- 3
lark-stubs/lark.pyi View File

@@ -2,7 +2,7 @@

from typing import (
TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
Literal, Protocol, Iterable,
Literal, Protocol, Tuple, Iterable,
from .visitors import Transformer
from .lexer import Token, Lexer, TerminalDef
@@ -34,11 +34,25 @@ class LarkOptions:
cache: Union[bool, str]
g_regex_flags: int
use_bytes: bool
import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]
source_path: Optional[str]

class PackageResource(object):
    # Name of the package that contains the resource.
    pkg_name: str
    # Path of the resource inside that package.
    path: str

    def __init__(self, pkg_name: str, path: str) -> None: ...

class FromPackageLoader:
    # Both attributes are assigned by the constructor in the implementation.
    pkg_name: str
    search_paths: Tuple[str, ...]

    def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...) -> None: ...
    def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ...

class Lark:
source: str
grammar_source: str
source_path: str
source_grammar: str
options: LarkOptions
lexer: Lexer
terminals: List[TerminalDef]
@@ -62,6 +76,8 @@ class Lark:
cache: Union[bool, str] = False,
g_regex_flags: int = ...,
use_bytes: bool = False,
import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ...,
source_path: Optional[str]=None,

@@ -71,6 +87,10 @@ class Lark:
def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:

def lex(self, text: str) -> Iterator[Token]:

+ 55
- 10
lark/ View File

@@ -4,10 +4,10 @@ from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedTok
import sys, os, pickle, hashlib
from io import open
import tempfile
from warnings import warn

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar
from .load_grammar import load_grammar, FromPackageLoader
from .tree import Tree
from .common import LexerConf, ParserConf

@@ -92,6 +92,10 @@ class LarkOptions(Serialize):
Accept an input of type ``bytes`` instead of ``str`` (Python 3 only).
A callback for editing the terminals before parse.
A list of paths or loader functions that specify where imported grammars are loaded from
Override the path from which the grammar is considered to have been loaded. Useful for relative imports and unconventional grammar loading

**=== End Options ===**
@@ -126,6 +130,8 @@ class LarkOptions(Serialize):
'edit_terminals': None,
'g_regex_flags': 0,
'use_bytes': False,
'import_paths': [],
'source_path': None,

def __init__(self, options_dict):
@@ -209,10 +215,13 @@ class Lark(Serialize):
re_module = re

# Some, but not all file-like objects have a 'name' attribute
self.source =
except AttributeError:
self.source = '<string>'
if self.options.source_path is None:
self.source_path =
except AttributeError:
self.source_path = '<string>'
self.source_path = self.options.source_path

# Drain file-like objects to get their contents
@@ -223,7 +232,7 @@ class Lark(Serialize):
grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.grammar_source = grammar
self.source_grammar = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ValueError("Grammar must be ascii only, when use_bytes=True")
@@ -286,7 +295,7 @@ class Lark(Serialize):
raise ValueError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

# Parse the grammar file and compose the grammars (TODO)
self.grammar = load_grammar(grammar, self.source, re_module, self.options.keep_all_tokens)
self.grammar = load_grammar(grammar, self.source, self.options.import_paths, self.options.keep_all_tokens)

if self.options.postlex is not None:
terminals_to_keep = set(self.options.postlex.always_accept)
@@ -395,7 +404,7 @@ class Lark(Serialize):
self.options = LarkOptions.deserialize(options, memo)
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source = '<deserialized>'
self.source_path = '<deserialized>'
self.parser = self.parser_class.deserialize(
@@ -430,8 +439,26 @@ class Lark(Serialize):
with open(grammar_filename, encoding='utf8') as f:
return cls(f, **options)

@classmethod
def open_from_package(cls, package, grammar_path, search_paths=("",), **options):
    """Create an instance of Lark with the grammar loaded from within the package `package`.
    This allows grammar loading from zipapps.

    Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`

    Parameters:
        package: Name of the package that contains the grammar.
        grammar_path: Path of the grammar file, looked up under each entry of `search_paths`.
        search_paths: Paths inside the package that are searched for the grammar.
        **options: Forwarded to the Lark constructor. `source_path` is only set if the
            caller did not provide it; `import_paths` gets the package loader appended.

    Example:

        Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
    """
    # Keep the loader under its own name rather than rebinding the `package` argument.
    package_loader = FromPackageLoader(package, search_paths)
    # base_path=None makes the loader search `search_paths` (an absolute import).
    full_path, text = package_loader(None, grammar_path)
    options.setdefault('source_path', full_path)
    # Register the loader so that %import statements inside the grammar are resolved
    # through the package as documented. Build a new list so that a list passed in by
    # the caller is not mutated.
    options['import_paths'] = list(options.get('import_paths', ())) + [package_loader]
    return cls(text, **options)

def __repr__(self):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)

def lex(self, text):
@@ -491,5 +518,23 @@ class Lark(Serialize):
except UnexpectedCharacters as e2:
e = e2

@property
def source(self):
    """Deprecated alias of ``source_path``; reading it emits a DeprecationWarning."""
    # stacklevel=2 attributes the warning to the code that read the attribute.
    warn("Lark.source attribute has been renamed to Lark.source_path", DeprecationWarning, stacklevel=2)
    return self.source_path

@source.setter
def source(self, value):
    # Writes are forwarded to the renamed attribute without a warning.
    self.source_path = value

@property
def grammar_source(self):
    """Deprecated alias of ``source_grammar``; reading it emits a DeprecationWarning."""
    # stacklevel=2 attributes the warning to the code that read the attribute.
    warn("Lark.grammar_source attribute has been renamed to Lark.source_grammar", DeprecationWarning, stacklevel=2)
    return self.source_grammar

@grammar_source.setter
def grammar_source(self, value):
    # Writes are forwarded to the renamed attribute without a warning.
    self.source_grammar = value


+ 88
- 24
lark/ View File

@@ -4,6 +4,7 @@ import os.path
import sys
from copy import copy, deepcopy
from io import open
import pkgutil

from .utils import bfs, eval_escaping, Py36, logger, classify_bool
from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -20,7 +21,7 @@ from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transfo
inline_args = v_args(inline=True)

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
IMPORT_PATHS = ['grammars']

EXT = '.lark'

@@ -648,6 +649,58 @@ class Grammar:
return terminals, compiled_rules, self.ignore

class PackageResource(object):
    """Represents a path inside a Package. Used by `FromPackageLoader`

    Instances compare by value (package name and path). The grammar loader builds a
    new PackageResource for the base path of every relative import and asserts that
    repeated imports of the same module agree on it, so identity comparison is not
    enough.
    """
    def __init__(self, pkg_name, path):
        self.pkg_name = pkg_name
        self.path = path

    def __eq__(self, other):
        if not isinstance(other, PackageResource):
            return NotImplemented
        return (self.pkg_name, self.path) == (other.pkg_name, other.path)

    def __ne__(self, other):
        # Python 2 does not derive __ne__ from __eq__.
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    def __hash__(self):
        # Defining __eq__ removes the default hash on Python 3; keep instances hashable.
        return hash((self.pkg_name, self.path))

    def __str__(self):
        return "<%s: %s>" % (self.pkg_name, self.path)

    def __repr__(self):
        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path)

class FromPackageLoader(object):
    """
    Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`.
    This allows them to be compatible even from within zip files.

    Relative imports are handled, so you can just freely use them.

    pkg_name: The name of the package. You can probably provide `__name__` most of the time
    search_paths: All the paths that will be searched on absolute imports.
    """
    def __init__(self, pkg_name, search_paths=("", )):
        self.pkg_name = pkg_name
        self.search_paths = search_paths

    def __repr__(self):
        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths)

    def __call__(self, base_path, grammar_path):
        """Load `grammar_path` from the package.

        base_path: None for an absolute import (every entry of `search_paths` is tried),
            or the PackageResource of the importing grammar's directory for a relative import.
        grammar_path: Path of the grammar, joined onto each candidate directory.

        Returns a ``(PackageResource, text)`` pair for the first candidate that can be read.
        Raises IOError if `base_path` does not belong to this loader's package or if no
        candidate can be read.
        """
        if base_path is None:
            to_try = self.search_paths
        else:
            # Check whether or not the importing grammar was loaded by this module.
            if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name:
                # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway
                raise IOError()
            to_try = [base_path.path]
        for path in to_try:
            full_path = os.path.join(path, grammar_path)
            try:
                text = pkgutil.get_data(self.pkg_name, full_path)
            except IOError:
                continue
            if text is None:
                # pkgutil.get_data returns None when the package cannot be located or its
                # loader has no get_data(); treat that like a missing file.
                continue
            return PackageResource(self.pkg_name, full_path), text.decode()
        raise IOError()

stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)

_imported_grammars = {}

@@ -787,39 +840,47 @@ class GrammarLoader:
('%ignore expects a value', ['%ignore %import\n']),

def __init__(self, re_module, global_keep_all_tokens):
def __init__(self, global_keep_all_tokens):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

self.canonize_tree = CanonizeTree()
self.re_module = re_module
self.global_keep_all_tokens = global_keep_all_tokens

def import_grammar(self, grammar_path, base_paths=[]):
def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
if grammar_path not in _imported_grammars:
import_paths = base_paths + IMPORT_PATHS
for import_path in import_paths:
with suppress(IOError):
joined_path = os.path.join(import_path, grammar_path)
with open(joined_path, encoding='utf8') as f:
text =
grammar = self.load_grammar(text, joined_path)
# import_paths take priority over base_path since they should handle relative imports and ignore everything else.
to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text =
except IOError:
grammar = self.load_grammar(text, joined_path, import_paths)
_imported_grammars[grammar_path] = grammar
open(grammar_path, encoding='utf8') # Force a file not found error
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False

return _imported_grammars[grammar_path]

def load_grammar(self, grammar_text, grammar_name='<?>'):
def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
"Parse grammar_text, verify, and create Grammar object. Display nice messages on error."

@@ -873,7 +934,7 @@ class GrammarLoader:
aliases = {name: arg1 or name} # Aliases if exist

if == 'import_lib': # Import from library
base_paths = []
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
@@ -883,16 +944,19 @@ class GrammarLoader:
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
base_paths = [os.path.split(base_file)[0]]
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
base_path = os.path.split(base_file)[0]
base_paths = [os.path.abspath(os.path.curdir)]
base_path = os.path.abspath(os.path.curdir)

import_base_paths, import_aliases = imports[dotted_path]
assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
import_base_path, import_aliases = imports[dotted_path]
assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
except KeyError:
imports[dotted_path] = base_paths, aliases
imports[dotted_path] = base_path, aliases

elif == 'declare':
for t in stmt.children:
@@ -901,9 +965,9 @@ class GrammarLoader:
assert False, stmt

# import grammars
for dotted_path, (base_paths, aliases) in imports.items():
for dotted_path, (base_path, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = self.import_grammar(grammar_path, base_paths=base_paths)
g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td
@@ -987,5 +1051,5 @@ class GrammarLoader:

def load_grammar(grammar, source, re_, global_keep_all_tokens):
return GrammarLoader(re_, global_keep_all_tokens).load_grammar(grammar, source)
def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)

+ 32
- 0
tests/ View File

@@ -11,6 +11,7 @@ from copy import copy, deepcopy
from lark.utils import Py36, isascii

from lark import Token
from lark.load_grammar import FromPackageLoader

from cStringIO import StringIO as cStringIO
@@ -1805,6 +1806,37 @@ def _make_parser_test(LEXER, PARSER):
tree = parser.parse(test_file)
self.assertEqual(tree.children, [Token('B', 'A')])

def test_import_custom_sources(self):
custom_loader = FromPackageLoader('tests', ('grammars', ))

grammar = """
start: startab

%import ab.startab

p = _Lark(grammar, import_paths=[custom_loader])
Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))

grammar = """
start: rule_to_import

%import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
p = _Lark(grammar, import_paths=[custom_loader])
x = p.parse('N')
self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])

custom_loader2 = FromPackageLoader('tests')
grammar = """
%import .test_relative_import (start, WS)
%ignore WS
p = _Lark(grammar, import_paths=[custom_loader2])
x = p.parse('12 capybaras')
self.assertEqual(x.children, ['12', 'capybaras'])

@unittest.skipIf(PARSER == 'cyk', "Doesn't work for CYK")
def test_prioritization(self):
"Tests effect of priority on result"
