
Merge branch 'MegaIng-custom_import_sources'

Erez Sh committed 3 years ago · commit 3aab72fffd
7 changed files with 233 additions and 52 deletions:

  1. examples/advanced/python3.lark  (+5, -13)
  2. lark-stubs/lark.pyi             (+23, -3)
  3. lark/grammars/common.lark       (+10, -1)
  4. lark/grammars/python.lark       (+19, -0)
  5. lark/lark.py                    (+56, -11)
  6. lark/load_grammar.py            (+88, -24)
  7. tests/test_parser.py            (+32, -0)
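
The feature in one picture: `import_paths` now accepts loader callables alongside plain path strings. A minimal sketch, drawn from the new test at the bottom of this diff (it assumes the tests/grammars/ab.lark fixture that test uses):

    from lark import Lark
    from lark.load_grammar import FromPackageLoader

    # Resolve %import statements from the data files of the 'tests' package,
    # searching its 'grammars' subdirectory.
    custom_loader = FromPackageLoader('tests', ('grammars',))

    grammar = """
    start: startab

    %import ab.startab
    """

    parser = Lark(grammar, import_paths=[custom_loader])
    print(parser.parse('ab').pretty())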

examples/advanced/python3.lark (+5, -13)

@@ -163,22 +163,14 @@ yield_arg: "from" test | testlist

 number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
 string: STRING | LONG_STRING

-// Tokens
-
-NAME: /[a-zA-Z_]\w*/
-COMMENT: /#[^\n]*/
-_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+
-
-STRING : /[ubf]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
-LONG_STRING: /[ubf]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is
-
-DEC_NUMBER: /0|[1-9]\d*/i
-HEX_NUMBER.2: /0x[\da-f]*/i
-OCT_NUMBER.2: /0o[0-7]*/i
-BIN_NUMBER.2 : /0b[0-1]*/i
-FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i
-IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i
+// Import terminals from standard library (grammars/python.lark)
+%import python (NAME, COMMENT, STRING, LONG_STRING)
+%import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER)
+
+// Other terminals
+
+_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+

 %ignore /[\t \f]+/ // WS
 %ignore /\\[\t \f]*\r?\n/ // LINE_CONT
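
A quick way to see the stdlib `%import python` in action outside the full Python grammar (a standalone sketch; the tiny grammar below is illustrative, not part of the example file):

    from lark import Lark

    # NAME and COMMENT come from lark's bundled grammars/python.lark.
    parser = Lark(r"""
    start: NAME+
    %import python (NAME, COMMENT)
    %ignore COMMENT
    %ignore /[\t \f]+/
    """)
    print(parser.parse("foo bar  # trailing comment").pretty())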


lark-stubs/lark.pyi (+23, -3)

@@ -2,7 +2,7 @@

 from typing import (
     TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
-    Literal, Protocol, Iterable,
+    Literal, Protocol, Tuple, Iterable,
 )
 from .visitors import Transformer
 from .lexer import Token, Lexer, TerminalDef

@@ -34,11 +34,25 @@ class LarkOptions:

     cache: Union[bool, str]
     g_regex_flags: int
     use_bytes: bool
+    import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]
+    source_path: Optional[str]
+
+
+class PackageResource(object):
+    pkg_name: str
+    path: str
+
+    def __init__(self, pkg_name: str, path: str): ...
+
+
+class FromPackageLoader:
+    def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
+    def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ...


 class Lark:
-    source: str
-    grammar_source: str
+    source_path: str
+    source_grammar: str
     options: LarkOptions
     lexer: Lexer
     terminals: List[TerminalDef]

@@ -62,6 +76,8 @@ class Lark:

         cache: Union[bool, str] = False,
         g_regex_flags: int = ...,
         use_bytes: bool = False,
+        import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ...,
+        source_path: Optional[str] = None,
     ):
         ...

@@ -71,6 +87,10 @@

     @classmethod
     def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
         ...
+    @classmethod
+    def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
+        ...

     def lex(self, text: str) -> Iterator[Token]:
         ...
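
The stub spells out the loader contract: each entry of `import_paths` is either a path string or a callable taking `(base_path, grammar_path)` and returning `(resolved_path, text)`. A minimal sketch of such a callable, with a hypothetical in-memory grammar table standing in for real storage:

    from typing import Tuple, Union

    # Hypothetical in-memory store, purely for illustration.
    GRAMMARS = {
        'numbers.lark': 'start: NUMBER+\n%import common.NUMBER\n',
    }

    def memory_loader(base_path: Union[None, str], grammar_path: str) -> Tuple[str, str]:
        # Raise IOError on a miss, so the grammar loader falls through to the
        # next source in import_paths (and finally to the stdlib loader).
        try:
            return grammar_path, GRAMMARS[grammar_path]
        except KeyError:
            raise IOError()

It plugs in exactly like a path string: `Lark(grammar, import_paths=[memory_loader])`.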


lark/grammars/common.lark (+10, -1)

@@ -1,3 +1,6 @@

+// Basic terminals for common use
+
+
 //
 // Numbers
 //

@@ -21,7 +24,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER

 // Strings
 //
 _STRING_INNER: /.*?/
-_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
+_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/

 ESCAPED_STRING : "\"" _STRING_ESC_INNER "\""

@@ -48,3 +51,9 @@ CR : /\r/

 LF : /\n/
 NEWLINE: (CR? LF)+
+
+// Comments
+SH_COMMENT: /#[^\n]*/
+CPP_COMMENT: /\/\/[^\n]*/
+C_COMMENT: "/*" /.*?/s "*/"
+SQL_COMMENT: /--[^\n]*/
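
The new comment terminals are ready-made for `%ignore`. A minimal sketch using SQL_COMMENT (the grammar is illustrative):

    from lark import Lark

    parser = Lark(r"""
    start: NUMBER+
    %import common.NUMBER
    %import common.WS
    %import common.SQL_COMMENT
    %ignore WS
    %ignore SQL_COMMENT
    """)
    print(parser.parse("1 2 -- a comment\n3").pretty())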

lark/grammars/python.lark (+19, -0)

@@ -0,0 +1,19 @@

+// Python terminals
+
+NAME: /[a-zA-Z_]\w*/
+COMMENT: /#[^\n]*/
+
+STRING : /[ubf]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
+LONG_STRING: /[ubf]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is
+
+DEC_NUMBER: /0|[1-9]\d*/i
+HEX_NUMBER.2: /0x[\da-f]*/i
+OCT_NUMBER.2: /0o[0-7]*/i
+BIN_NUMBER.2 : /0b[0-1]*/i
+FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i
+IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i
+
+
+// Comma-separated list (with an optional trailing comma)
+cs_list{item}: item ("," item)* ","?
+_cs_list{item}: item ("," item)* ","?
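
The comma-separated-list templates at the end are ordinary template rules. A sketch of how such a template expands, with the same shape defined inline rather than imported:

    from lark import Lark

    parser = Lark(r"""
    start: "[" cs_list{NUMBER} "]"
    cs_list{item}: item ("," item)* ","?
    %import common.NUMBER
    %import common.WS
    %ignore WS
    """)
    print(parser.parse("[1, 2, 3,]").pretty())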

lark/lark.py (+56, -11)

@@ -4,10 +4,10 @@ from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedTok

 import sys, os, pickle, hashlib
 from io import open
 import tempfile
+from warnings import warn

 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
-from .load_grammar import load_grammar
+from .load_grammar import load_grammar, FromPackageLoader
 from .tree import Tree
 from .common import LexerConf, ParserConf

@@ -92,6 +92,10 @@ class LarkOptions:

         Accept an input of type ``bytes`` instead of ``str`` (Python 3 only).
     edit_terminals
         A callback for editing the terminals before parse.
+    import_paths
+        A list of either paths or loader functions, specifying where grammars are imported from
+    source_path
+        Override the source path the grammar was loaded from. Useful for relative imports and unconventional grammar loading

     **=== End Options ===**
     """

@@ -126,6 +130,8 @@

         'edit_terminals': None,
         'g_regex_flags': 0,
         'use_bytes': False,
+        'import_paths': [],
+        'source_path': None,
     }

     def __init__(self, options_dict):

@@ -209,10 +215,13 @@

             re_module = re

         # Some, but not all file-like objects have a 'name' attribute
-        try:
-            self.source = grammar.name
-        except AttributeError:
-            self.source = '<string>'
+        if self.options.source_path is None:
+            try:
+                self.source_path = grammar.name
+            except AttributeError:
+                self.source_path = '<string>'
+        else:
+            self.source_path = self.options.source_path

         # Drain file-like objects to get their contents
         try:

@@ -223,7 +232,7 @@

             grammar = read()

         assert isinstance(grammar, STRING_TYPE)
-        self.grammar_source = grammar
+        self.source_grammar = grammar
         if self.options.use_bytes:
             if not isascii(grammar):
                 raise ValueError("Grammar must be ascii only, when use_bytes=True")

@@ -285,8 +294,8 @@

         if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
             raise ValueError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

-        # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source, re_module, self.options.keep_all_tokens)
+        # Parse the grammar file and compose the grammars
+        self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)

         if self.options.postlex is not None:
             terminals_to_keep = set(self.options.postlex.always_accept)

@@ -395,7 +404,7 @@

         options.update(kwargs)
         self.options = LarkOptions.deserialize(options, memo)
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
-        self.source = '<deserialized>'
+        self.source_path = '<deserialized>'
         self._prepare_callbacks()
         self.parser = self.parser_class.deserialize(
             data['parser'],

@@ -430,8 +439,26 @@

         with open(grammar_filename, encoding='utf8') as f:
             return cls(f, **options)

+    @classmethod
+    def open_from_package(cls, package, grammar_path, search_paths=("",), **options):
+        """Create an instance of Lark with the grammar loaded from within the package `package`.
+        This allows grammar loading from zipapps.
+
+        Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`.
+
+        Example:
+
+            Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
+        """
+        package = FromPackageLoader(package, search_paths)
+        full_path, text = package(None, grammar_path)
+        options.setdefault('source_path', full_path)
+        options.setdefault('import_paths', [])
+        options['import_paths'].append(package)
+        return cls(text, **options)
+
     def __repr__(self):
-        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
+        return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)

     def lex(self, text):

@@ -491,5 +518,23 @@

             except UnexpectedCharacters as e2:
                 e = e2

+    @property
+    def source(self):
+        warn("Lark.source attribute has been renamed to Lark.source_path", DeprecationWarning)
+        return self.source_path
+
+    @source.setter
+    def source(self, value):
+        self.source_path = value
+
+    @property
+    def grammar_source(self):
+        warn("Lark.grammar_source attribute has been renamed to Lark.source_grammar", DeprecationWarning)
+        return self.source_grammar
+
+    @grammar_source.setter
+    def grammar_source(self, value):
+        self.source_grammar = value
+
 ###}
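
Putting the new pieces together: a sketch of loading a packaged grammar and of the deprecation shim above (`my_package` and its grammars/example.lark resource are hypothetical):

    from lark import Lark

    # Loads my_package/grammars/example.lark via pkgutil, so it also works
    # when my_package ships inside a zipapp. Relative %imports inside
    # example.lark resolve through the same FromPackageLoader.
    parser = Lark.open_from_package('my_package', 'example.lark', ('grammars',), parser='lalr')

    # The old attribute names still work, but now warn:
    import warnings
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        parser.source  # forwards to parser.source_path
    assert caught[0].category is DeprecationWarning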

lark/load_grammar.py (+88, -24)

@@ -4,6 +4,7 @@ import os.path

 import sys
 from copy import copy, deepcopy
 from io import open
+import pkgutil

 from .utils import bfs, eval_escaping, Py36, logger, classify_bool
 from .lexer import Token, TerminalDef, PatternStr, PatternRE

@@ -20,7 +21,7 @@ from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transfo

 inline_args = v_args(inline=True)

 __path__ = os.path.dirname(__file__)
-IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
+IMPORT_PATHS = ['grammars']

 EXT = '.lark'

@@ -648,6 +649,58 @@

         return terminals, compiled_rules, self.ignore


+class PackageResource(object):
+    """
+    Represents a path inside a Package. Used by `FromPackageLoader`.
+    """
+    def __init__(self, pkg_name, path):
+        self.pkg_name = pkg_name
+        self.path = path
+
+    def __str__(self):
+        return "<%s: %s>" % (self.pkg_name, self.path)
+
+    def __repr__(self):
+        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path)
+
+
+class FromPackageLoader(object):
+    """
+    Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of `open`.
+    This allows them to work even from within zip files.
+
+    Relative imports are handled, so you can just freely use them.
+
+    pkg_name: The name of the package. You can probably provide `__name__` most of the time.
+    search_paths: All the paths that will be searched on absolute imports.
+    """
+    def __init__(self, pkg_name, search_paths=("", )):
+        self.pkg_name = pkg_name
+        self.search_paths = search_paths
+
+    def __repr__(self):
+        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths)
+
+    def __call__(self, base_path, grammar_path):
+        if base_path is None:
+            to_try = self.search_paths
+        else:
+            # Check whether or not the importing grammar was loaded by this loader.
+            if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name:
+                # Technically false, but FileNotFoundError doesn't exist in Python 2.7, and this message should never reach the end user anyway
+                raise IOError()
+            to_try = [base_path.path]
+        for path in to_try:
+            full_path = os.path.join(path, grammar_path)
+            try:
+                text = pkgutil.get_data(self.pkg_name, full_path)
+            except IOError:
+                continue
+            else:
+                return PackageResource(self.pkg_name, full_path), text.decode()
+        raise IOError()
+
+
+stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)
+

 _imported_grammars = {}

@@ -787,39 +840,47 @@

         ('%ignore expects a value', ['%ignore %import\n']),
     ]

-    def __init__(self, re_module, global_keep_all_tokens):
+    def __init__(self, global_keep_all_tokens):
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
         rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
         callback = ParseTreeBuilder(rules, ST).create_callback()
-        lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])
+        import re
+        lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])

         parser_conf = ParserConf(rules, callback, ['start'])
         self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

         self.canonize_tree = CanonizeTree()
-        self.re_module = re_module
         self.global_keep_all_tokens = global_keep_all_tokens

-    def import_grammar(self, grammar_path, base_paths=[]):
+    def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
         if grammar_path not in _imported_grammars:
-            import_paths = base_paths + IMPORT_PATHS
-            for import_path in import_paths:
-                with suppress(IOError):
-                    joined_path = os.path.join(import_path, grammar_path)
-                    with open(joined_path, encoding='utf8') as f:
-                        text = f.read()
-                    grammar = self.load_grammar(text, joined_path)
+            # import_paths take priority over base_path, since they should handle relative imports and ignore everything else.
+            to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
+            for source in to_try:
+                try:
+                    if callable(source):
+                        joined_path, text = source(base_path, grammar_path)
+                    else:
+                        joined_path = os.path.join(source, grammar_path)
+                        with open(joined_path, encoding='utf8') as f:
+                            text = f.read()
+                except IOError:
+                    continue
+                else:
+                    grammar = self.load_grammar(text, joined_path, import_paths)
                     _imported_grammars[grammar_path] = grammar
                     break
             else:
-                open(grammar_path, encoding='utf8')  # Force a file not found error
+                # Search failed. Make Python throw a nice error.
+                open(grammar_path, encoding='utf8')
                 assert False

         return _imported_grammars[grammar_path]

-    def load_grammar(self, grammar_text, grammar_name='<?>'):
+    def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."

@@ -873,7 +934,7 @@

             aliases = {name: arg1 or name}  # Aliases if exist

             if path_node.data == 'import_lib':  # Import from library
-                base_paths = []
+                base_path = None
             else:  # Relative import
                 if grammar_name == '<string>':  # Import relative to script file path if grammar is coded in script
                     try:

@@ -883,16 +944,19 @@

                 else:
                     base_file = grammar_name  # Import relative to grammar file path if external grammar file
                 if base_file:
-                    base_paths = [os.path.split(base_file)[0]]
+                    if isinstance(base_file, PackageResource):
+                        base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
+                    else:
+                        base_path = os.path.split(base_file)[0]
                 else:
-                    base_paths = [os.path.abspath(os.path.curdir)]
+                    base_path = os.path.abspath(os.path.curdir)

             try:
-                import_base_paths, import_aliases = imports[dotted_path]
-                assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
+                import_base_path, import_aliases = imports[dotted_path]
+                assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
                 import_aliases.update(aliases)
             except KeyError:
-                imports[dotted_path] = base_paths, aliases
+                imports[dotted_path] = base_path, aliases

         elif stmt.data == 'declare':
             for t in stmt.children:

@@ -901,9 +965,9 @@

                 assert False, stmt

         # import grammars
-        for dotted_path, (base_paths, aliases) in imports.items():
+        for dotted_path, (base_path, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = self.import_grammar(grammar_path, base_paths=base_paths)
+            g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

             term_defs += new_td

@@ -987,5 +1051,5 @@

-def load_grammar(grammar, source, re_, global_keep_all_tokens):
-    return GrammarLoader(re_, global_keep_all_tokens).load_grammar(grammar, source)
+def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
+    return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
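
What the new resolution order means in practice: `import_grammar` tries the user's `import_paths` first, then the importing grammar's own `base_path`, then `stdlib_loader`. A loader is just a callable `(base_path, grammar_path) -> (resolved_path, text)`; calling the stdlib loader by hand shows the protocol:

    from lark.load_grammar import FromPackageLoader

    stdlib = FromPackageLoader('lark', ('grammars',))
    resource, text = stdlib(None, 'common.lark')
    print(resource)                  # <lark: grammars/common.lark>
    print(text.splitlines()[0])      # first line of the bundled grammar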

tests/test_parser.py (+32, -0)

@@ -11,6 +11,7 @@ from copy import copy, deepcopy

 from lark.utils import Py36, isascii

 from lark import Token
+from lark.load_grammar import FromPackageLoader

 try:
     from cStringIO import StringIO as cStringIO

@@ -1805,6 +1806,37 @@ def _make_parser_test(LEXER, PARSER):

             tree = parser.parse(test_file)
             self.assertEqual(tree.children, [Token('B', 'A')])

+        def test_import_custom_sources(self):
+            custom_loader = FromPackageLoader('tests', ('grammars', ))
+
+            grammar = """
+            start: startab
+
+            %import ab.startab
+            """
+
+            p = _Lark(grammar, import_paths=[custom_loader])
+            self.assertEqual(p.parse('ab'),
+                             Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))
+
+            grammar = """
+            start: rule_to_import
+
+            %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
+            """
+            p = _Lark(grammar, import_paths=[custom_loader])
+            x = p.parse('N')
+            self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])
+
+            custom_loader2 = FromPackageLoader('tests')
+            grammar = """
+            %import .test_relative_import (start, WS)
+            %ignore WS
+            """
+            p = _Lark(grammar, import_paths=[custom_loader2])
+            x = p.parse('12 capybaras')
+            self.assertEqual(x.children, ['12', 'capybaras'])
+
         @unittest.skipIf(PARSER == 'cyk', "Doesn't work for CYK")
         def test_prioritization(self):
             "Tests effect of priority on result"

