Browse Files

Merge branch 'MegaIng-custom_import_sources'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.0
Erez Sh 3 years ago
parent
commit
3aab72fffd
7 changed files with 233 additions and 52 deletions
1. +5 -13 examples/advanced/python3.lark
2. +23 -3 lark-stubs/lark.pyi
3. +10 -1 lark/grammars/common.lark
4. +19 -0 lark/grammars/python.lark
5. +56 -11 lark/lark.py
6. +88 -24 lark/load_grammar.py
7. +32 -0 tests/test_parser.py

+5 -13 examples/advanced/python3.lark (View File)

@@ -163,22 +163,14 @@ yield_arg: "from" test | testlist

number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
string: STRING | LONG_STRING
// Tokens

NAME: /[a-zA-Z_]\w*/
COMMENT: /#[^\n]*/
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+

// Import terminals from standard library (grammars/python.lark)
%import python (NAME, COMMENT, STRING, LONG_STRING)
%import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER)

STRING : /[ubf]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
LONG_STRING: /[ubf]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is
// Other terminals

DEC_NUMBER: /0|[1-9]\d*/i
HEX_NUMBER.2: /0x[\da-f]*/i
OCT_NUMBER.2: /0o[0-7]*/i
BIN_NUMBER.2 : /0b[0-1]*/i
FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i
IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+

%ignore /[\t \f]+/ // WS
%ignore /\\[\t \f]*\r?\n/ // LINE_CONT
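
With these terminals moved into lark/grammars/python.lark, other grammars can import them from the standard library instead of redefining them. A minimal sketch of the new capability (illustrative grammar, not part of the commit; assumes a lark version containing this change):

from lark import Lark

# NAME resolves against the new lark/grammars/python.lark through the
# bundled stdlib loader; WS comes from common.lark as before.
parser = Lark(r"""
    start: NAME+
    %import python (NAME)
    %import common (WS)
    %ignore WS
""", parser="lalr")

print(parser.parse("foo bar").children)  # [Token('NAME', 'foo'), Token('NAME', 'bar')]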


+23 -3 lark-stubs/lark.pyi (View File)

@@ -2,7 +2,7 @@

from typing import (
TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
Literal, Protocol, Iterable,
Literal, Protocol, Tuple, Iterable,
)
from .visitors import Transformer
from .lexer import Token, Lexer, TerminalDef
@@ -34,11 +34,25 @@ class LarkOptions:
cache: Union[bool, str]
g_regex_flags: int
use_bytes: bool
import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]]
source_path: Optional[str]


class PackageResource(object):
pkg_name: str
path: str
def __init__(self, pkg_name: str, path: str): ...

class FromPackageLoader:
def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: ...


class Lark:
source: str
grammar_source: str
source_path: str
source_grammar: str
options: LarkOptions
lexer: Lexer
terminals: List[TerminalDef]
@@ -62,6 +76,8 @@ class Lark:
cache: Union[bool, str] = False,
g_regex_flags: int = ...,
use_bytes: bool = False,
import_paths: List[Union[str, Callable[[Union[None, str, PackageResource], str], Tuple[str, str]]]] = ...,
source_path: Optional[str]=None,
):
...

@@ -71,6 +87,10 @@ class Lark:
@classmethod
def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
...
@classmethod
def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
...

def lex(self, text: str) -> Iterator[Token]:
...


+10 -1 lark/grammars/common.lark (View File)

@@ -1,3 +1,6 @@
// Basic terminals for common use


//
// Numbers
//
@@ -21,7 +24,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
// Strings
//
_STRING_INNER: /.*?/
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/
_STRING_ESC_INNER: _STRING_INNER /(?<!\\)(\\\\)*?/

ESCAPED_STRING : "\"" _STRING_ESC_INNER "\""

@@ -48,3 +51,9 @@ CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+


// Comments
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /.*?/s "*/"
SQL_COMMENT: /--[^\n]*/
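
The new comment terminals can be imported and ignored like any other stdlib terminal. A small sketch (illustrative grammar, not from the commit):

from lark import Lark

# SH_COMMENT is one of the comment terminals added to common.lark above.
parser = Lark(r"""
    start: NUMBER+
    %import common (NUMBER, SH_COMMENT, WS)
    %ignore SH_COMMENT
    %ignore WS
""")

print(parser.parse("1 2  # a shell-style comment\n3").children)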

+19 -0 lark/grammars/python.lark (View File)

@@ -0,0 +1,19 @@
// Python terminals

NAME: /[a-zA-Z_]\w*/
COMMENT: /#[^\n]*/

STRING : /[ubf]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
LONG_STRING: /[ubf]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/is

DEC_NUMBER: /0|[1-9]\d*/i
HEX_NUMBER.2: /0x[\da-f]*/i
OCT_NUMBER.2: /0o[0-7]*/i
BIN_NUMBER.2 : /0b[0-1]*/i
FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i
IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i


// Comma-separated list (with an optional trailing comma)
cs_list{item}: item ("," item)* ","?
_cs_list{item}: item ("," item)* ","?

+56 -11 lark/lark.py (View File)

@@ -4,10 +4,10 @@ from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedTok
import sys, os, pickle, hashlib
from io import open
import tempfile
from warnings import warn

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar
from .load_grammar import load_grammar, FromPackageLoader
from .tree import Tree
from .common import LexerConf, ParserConf

@@ -92,6 +92,10 @@ class LarkOptions(Serialize):
Accept an input of type ``bytes`` instead of ``str`` (Python 3 only).
edit_terminals
A callback for editing the terminals before parse.
import_paths
A list of either paths or loader functions, specifying where grammars are imported from
source_path
Override the source path the grammar was loaded from. Useful for relative imports and unconventional grammar loading

**=== End Options ===**
"""
@@ -126,6 +130,8 @@ class LarkOptions(Serialize):
'edit_terminals': None,
'g_regex_flags': 0,
'use_bytes': False,
'import_paths': [],
'source_path': None,
}

def __init__(self, options_dict):
@@ -209,10 +215,13 @@ class Lark(Serialize):
re_module = re

# Some, but not all file-like objects have a 'name' attribute
try:
self.source = grammar.name
except AttributeError:
self.source = '<string>'
if self.options.source_path is None:
try:
self.source_path = grammar.name
except AttributeError:
self.source_path = '<string>'
else:
self.source_path = self.options.source_path

# Drain file-like objects to get their contents
try:
@@ -223,7 +232,7 @@ class Lark(Serialize):
grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.grammar_source = grammar
self.source_grammar = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ValueError("Grammar must be ascii only, when use_bytes=True")
@@ -285,8 +294,8 @@ class Lark(Serialize):
if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
raise ValueError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

# Parse the grammar file and compose the grammars (TODO)
self.grammar = load_grammar(grammar, self.source, re_module, self.options.keep_all_tokens)
# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)

if self.options.postlex is not None:
terminals_to_keep = set(self.options.postlex.always_accept)
@@ -395,7 +404,7 @@ class Lark(Serialize):
options.update(kwargs)
self.options = LarkOptions.deserialize(options, memo)
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source = '<deserialized>'
self.source_path = '<deserialized>'
self._prepare_callbacks()
self.parser = self.parser_class.deserialize(
data['parser'],
@@ -430,8 +439,26 @@ class Lark(Serialize):
with open(grammar_filename, encoding='utf8') as f:
return cls(f, **options)

@classmethod
def open_from_package(cls, package, grammar_path, search_paths=("",), **options):
"""Create an instance of Lark with the grammar loaded from within the package `package`.
This allows grammar loading from zipapps.

Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`.

Example:

Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
"""
package = FromPackageLoader(package, search_paths)
full_path, text = package(None, grammar_path)
options.setdefault('source_path', full_path)
options.setdefault('import_paths', [])
options['import_paths'].append(package)
return cls(text, **options)

def __repr__(self):
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer)
return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source_path, self.options.parser, self.options.lexer)


def lex(self, text):
@@ -491,5 +518,23 @@ class Lark(Serialize):
except UnexpectedCharacters as e2:
e = e2

@property
def source(self):
warn("Lark.source attribute has been renamed to Lark.source_path", DeprecationWarning)
return self.source_path

@source.setter
def source(self, value):
self.source_path = value

@property
def grammar_source(self):
warn("Lark.grammar_source attribute has been renamed to Lark.source_grammar", DeprecationWarning)
return self.source_grammar

@grammar_source.setter
def grammar_source(self, value):
self.source_grammar = value


###}
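
Taken together, the new classmethod makes zipapp-safe grammar loading a one-liner. A sketch of the intended usage (``my_pkg`` and ``example.lark`` are placeholder names, not from the commit):

from lark import Lark

# Hypothetical layout: my_pkg/grammars/example.lark
# Grammar data is read via pkgutil.get_data, so this keeps working when
# my_pkg ships inside a zipapp or wheel.
parser = Lark.open_from_package("my_pkg", "example.lark", ("grammars",), parser="lalr")

# The old attribute names still work but now emit a DeprecationWarning:
print(parser.source_path)   # new name
print(parser.source)        # deprecated alias, warns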

+88 -24 lark/load_grammar.py (View File)

@@ -4,6 +4,7 @@ import os.path
import sys
from copy import copy, deepcopy
from io import open
import pkgutil

from .utils import bfs, eval_escaping, Py36, logger, classify_bool
from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -20,7 +21,7 @@ from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transfo
inline_args = v_args(inline=True)

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
IMPORT_PATHS = ['grammars']

EXT = '.lark'

@@ -648,6 +649,58 @@ class Grammar:
return terminals, compiled_rules, self.ignore


class PackageResource(object):
"""
Represents a path inside a Package. Used by `FromPackageLoader`
"""
def __init__(self, pkg_name, path):
self.pkg_name = pkg_name
self.path = path

def __str__(self):
return "<%s: %s>" % (self.pkg_name, self.path)

def __repr__(self):
return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path)

class FromPackageLoader(object):
"""
Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`.
This allows them to be compatible even from within zip files.

Relative imports are handled, so you can just freely use them.

pkg_name: The name of the package. You can usually just pass ``__name__``.
search_paths: All the paths that will be searched on absolute imports.
"""
def __init__(self, pkg_name, search_paths=("", )):
self.pkg_name = pkg_name
self.search_paths = search_paths

def __repr__(self):
return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths)

def __call__(self, base_path, grammar_path):
if base_path is None:
to_try = self.search_paths
else:
# Check whether or not the importing grammar was loaded by this module.
if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name:
# Technically false, but FileNotFoundError doesn't exist in Python 2.7, and this message should never reach the end user anyway
raise IOError()
to_try = [base_path.path]
for path in to_try:
full_path = os.path.join(path, grammar_path)
try:
text = pkgutil.get_data(self.pkg_name, full_path)
except IOError:
continue
else:
return PackageResource(self.pkg_name, full_path), text.decode()
raise IOError()

stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)


_imported_grammars = {}

@@ -787,39 +840,47 @@ class GrammarLoader:
('%ignore expects a value', ['%ignore %import\n']),
]

def __init__(self, re_module, global_keep_all_tokens):
def __init__(self, global_keep_all_tokens):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

self.canonize_tree = CanonizeTree()
self.re_module = re_module
self.global_keep_all_tokens = global_keep_all_tokens

def import_grammar(self, grammar_path, base_paths=[]):
def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
if grammar_path not in _imported_grammars:
import_paths = base_paths + IMPORT_PATHS
for import_path in import_paths:
with suppress(IOError):
joined_path = os.path.join(import_path, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
grammar = self.load_grammar(text, joined_path)
# import_paths take priority over base_path since they should handle relative imports and ignore everything else.
to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
else:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
except IOError:
continue
else:
grammar = self.load_grammar(text, joined_path, import_paths)
_imported_grammars[grammar_path] = grammar
break
else:
open(grammar_path, encoding='utf8') # Force a file not found error
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False

return _imported_grammars[grammar_path]

def load_grammar(self, grammar_text, grammar_name='<?>'):
def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
"Parse grammar_text, verify, and create Grammar object. Display nice messages on error."

try:
@@ -873,7 +934,7 @@ class GrammarLoader:
aliases = {name: arg1 or name} # Aliases if exist

if path_node.data == 'import_lib': # Import from library
base_paths = []
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
try:
@@ -883,16 +944,19 @@ class GrammarLoader:
else:
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
base_paths = [os.path.split(base_file)[0]]
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
else:
base_path = os.path.split(base_file)[0]
else:
base_paths = [os.path.abspath(os.path.curdir)]
base_path = os.path.abspath(os.path.curdir)

try:
import_base_paths, import_aliases = imports[dotted_path]
assert base_paths == import_base_paths, 'Inconsistent base_paths for %s.' % '.'.join(dotted_path)
import_base_path, import_aliases = imports[dotted_path]
assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
import_aliases.update(aliases)
except KeyError:
imports[dotted_path] = base_paths, aliases
imports[dotted_path] = base_path, aliases

elif stmt.data == 'declare':
for t in stmt.children:
@@ -901,9 +965,9 @@ class GrammarLoader:
assert False, stmt

# import grammars
for dotted_path, (base_paths, aliases) in imports.items():
for dotted_path, (base_path, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = self.import_grammar(grammar_path, base_paths=base_paths)
g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td
@@ -987,5 +1051,5 @@ class GrammarLoader:



def load_grammar(grammar, source, re_, global_keep_all_tokens):
return GrammarLoader(re_, global_keep_all_tokens).load_grammar(grammar, source)
def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
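
Note that a custom import source does not have to be a FromPackageLoader: import_grammar accepts any callable with the (base_path, grammar_path) -> (path, text) signature that raises IOError on a miss. A minimal sketch with a hypothetical in-memory loader (names and grammar text are illustrative):

from lark import Lark

# Hypothetical in-memory grammar store; keys are the paths lark asks for.
GRAMMARS = {
    'ab.lark': 'startab: "a" "b"\n',
}

def memory_loader(base_path, grammar_path):
    # base_path is None for absolute (%import lib.rule) imports.
    try:
        return grammar_path, GRAMMARS[grammar_path]
    except KeyError:
        raise IOError()  # tells import_grammar to try the next source

parser = Lark("""
    start: startab
    %import ab.startab
""", import_paths=[memory_loader])

print(parser.parse("ab"))  # Tree('start', [Tree('startab', [])])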

+32 -0 tests/test_parser.py (View File)

@@ -11,6 +11,7 @@ from copy import copy, deepcopy
from lark.utils import Py36, isascii

from lark import Token
from lark.load_grammar import FromPackageLoader

try:
from cStringIO import StringIO as cStringIO
@@ -1805,6 +1806,37 @@ def _make_parser_test(LEXER, PARSER):
tree = parser.parse(test_file)
self.assertEqual(tree.children, [Token('B', 'A')])

def test_import_custom_sources(self):
custom_loader = FromPackageLoader('tests', ('grammars', ))

grammar = """
start: startab

%import ab.startab
"""

p = _Lark(grammar, import_paths=[custom_loader])
self.assertEqual(p.parse('ab'),
Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))

grammar = """
start: rule_to_import

%import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
"""
p = _Lark(grammar, import_paths=[custom_loader])
x = p.parse('N')
self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])

custom_loader2 = FromPackageLoader('tests')
grammar = """
%import .test_relative_import (start, WS)
%ignore WS
"""
p = _Lark(grammar, import_paths=[custom_loader2])
x = p.parse('12 capybaras')
self.assertEqual(x.children, ['12', 'capybaras'])

@unittest.skipIf(PARSER == 'cyk', "Doesn't work for CYK")
def test_prioritization(self):
"Tests effect of priority on result"

