
Merge branch 'grammar_builder' of https://github.com/MegaIng/lark into MegaIng-grammar_builder

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.2
Erez Sh, 3 years ago
Parent commit
Current commit
2f2cebedf8
10 changed files, with 490 additions and 320 deletions
  1. +20 -1    docs/grammar.md
  2. +59 -0    examples/advanced/grammar_building.py
  3. +1 -0     lark-stubs/__init__.pyi
  4. +9 -0     lark-stubs/grammar.pyi
  5. +3 -1     lark-stubs/lark.pyi
  6. +28 -0    lark-stubs/load_grammar.pyi
  7. +1 -1     lark/grammars/common.lark
  8. +44 -39   lark/lark.py
  9. +290 -273 lark/load_grammar.py
  10. +35 -5   tests/test_grammar.py

+20 -1  docs/grammar.md

@@ -291,7 +291,7 @@ Declare a terminal without defining it. Useful for plugins.

### %override

Override a rule, affecting all the rules that refer to it.
Override a rule or terminal, affecting all references to it, even in imported grammars.

Useful for implementing an inheritance pattern when importing grammars.

@@ -302,3 +302,22 @@ Useful for implementing an inheritance pattern when importing grammars.
// Add hex support to my_grammar
%override number: NUMBER | /0x\w+/
```

### %extend

Extend the definition of a rule or terminal, e.g. add a new option for what it can match, as if the new alternative had been appended with `|`.

Useful for splitting up a definition of a complex rule with many different options over multiple files.

Can also be used to implement a plugin system where a core grammar is extended by others.


**Example:**
```perl
%import my_grammar (start, NUMBER)

// Add hex support to my_grammar
%extend NUMBER: /0x\w+/
```

For both `%extend` and `%override`, there is no requirement that the rule or terminal come from another file, but that is probably the most common use case.
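The same behaviour can be exercised directly from Python. A minimal sketch (the grammar below is made up for illustration; only `%extend` and the `Lark` constructor come from this commit), extending a definition from the same string:

```python
from lark import Lark

# Hypothetical grammar: `value` is defined and then extended in the same file,
# which the note above permits, though import-then-extend is the usual pattern.
grammar = r"""
start: value+
value: NUMBER
%extend value: WORD

NUMBER: /\d+/
WORD: /[a-z]+/
%ignore " "
"""

parser = Lark(grammar, parser='lalr')
print(parser.parse("12 abc 7").pretty())
```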

+59 -0  examples/advanced/grammar_building.py

@@ -0,0 +1,59 @@
from pathlib import Path

from lark.indenter import Indenter
from lark.lark import Lark
from lark.load_grammar import GrammarBuilder

MATCH_GRAMMAR = ('match', """

%extend compound_stmt: match_stmt

match_stmt: "match" test ":" cases

cases: _NEWLINE _INDENT case+ _DEDENT

case: "case" test ":" suite // test is not quite correct.

""", ('compound_stmt', 'test', 'suite', '_DEDENT', '_INDENT', '_NEWLINE'))

EXTENSIONS = (MATCH_GRAMMAR,)

builder = GrammarBuilder()

builder.load_grammar((Path(__file__).with_name('python3.lark')).read_text(), 'python3')

for name, ext_grammar, needed_names in EXTENSIONS:
    mangle = builder.get_mangle(name, dict(zip(needed_names, needed_names)))
    builder.load_grammar(ext_grammar, name, mangle)

grammar = builder.build()


class PythonIndenter(Indenter):
    NL_type = '_NEWLINE'
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8


parser = Lark(grammar, parser='lalr', start=['single_input', 'file_input', 'eval_input'], postlex=PythonIndenter())

tree = parser.parse(r"""

a = 5

def name(n):
    match n:
        case 1:
            print("one")
        case 2:
            print("two")
        case _:
            print("number is to big")

name(a)
""", start='file_input')

print(tree.pretty())
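A note on the example above: `builder.get_mangle(name, dict(zip(needed_names, needed_names)))` builds an identity alias map, so the names listed in `needed_names` (`compound_stmt`, `test`, `suite` and the indentation terminals) are left unmangled and resolve to the definitions already loaded from `python3.lark`, while everything else the extension defines is prefixed with `match__`. The `%extend compound_stmt: match_stmt` line is what splices the new statement into the base grammar.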

+1 -0  lark-stubs/__init__.pyi

@@ -4,6 +4,7 @@ from .tree import *
from .visitors import *
from .exceptions import *
from .lexer import *
from .load_grammar import *
from .lark import *
from logging import Logger as _Logger



+9 -0  lark-stubs/grammar.pyi

@@ -0,0 +1,9 @@
from typing import Optional, Tuple


class RuleOptions:
    keep_all_tokens: bool
    expand1: bool
    priority: int
    template_source: Optional[str]
    empty_indices: Tuple[bool, ...]

+3 -1  lark-stubs/lark.pyi

@@ -8,6 +8,7 @@ from .visitors import Transformer
from .lexer import Token, Lexer, TerminalDef
from .tree import Tree
from .exceptions import UnexpectedInput
from .load_grammar import Grammar

_T = TypeVar('_T')

@@ -54,13 +55,14 @@ class FromPackageLoader:
class Lark:
    source_path: str
    source_grammar: str
    grammar: Grammar
    options: LarkOptions
    lexer: Lexer
    terminals: List[TerminalDef]

    def __init__(
        self,
        grammar: Union[str, IO[str]],
        grammar: Union[Grammar, str, IO[str]],
        *,
        start: Union[None, str, List[str]] = "start",
        parser: Literal["earley", "lalr", "cyk"] = "auto",


+28 -0  lark-stubs/load_grammar.pyi

@@ -0,0 +1,28 @@
from typing import List, Tuple, Union, Callable, Dict, Optional

from lark import Tree
from lark.grammar import RuleOptions


class Grammar:
    rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
    term_defs: List[Tuple[str, Tuple[Tree, int]]]
    ignore: List[str]


class GrammarBuilder:
    global_keep_all_tokens: bool
    import_paths: List[Union[str, Callable]]

    def __init__(self, global_keep_all_tokens=..., import_paths=...): ...

    def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None): ...

    def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str],
                  base_mangle: Callable[[str], str] = None): ...

    def get_mangle(self, prefix: str, aliases: Dict[str, str], base_mangle: Callable[[str], str] = None): ...

    def check(self): ...

    def build(self) -> Grammar: ...
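A minimal sketch of this interface in use (the grammar text and name are made up; the calls mirror the `grammar_building.py` example added in this commit, and the resulting `Grammar` can be passed straight to `Lark` per the widened signature in `lark-stubs/lark.pyi`):

```python
from lark import Lark
from lark.load_grammar import GrammarBuilder

builder = GrammarBuilder()
# Hypothetical grammar text, registered under a hypothetical name.
builder.load_grammar(r"""
start: WORD+
WORD: /[a-z]+/
%ignore " "
""", 'my_grammar')

grammar = builder.build()             # -> Grammar
parser = Lark(grammar, parser='lalr')
print(parser.parse("hello world").pretty())
```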

+1 -1  lark/grammars/common.lark

@@ -55,5 +55,5 @@ NEWLINE: (CR? LF)+
// Comments
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /.*?/s "*/"
C_COMMENT: "/*" /(.|\n)*?/ "*/"
SQL_COMMENT: /--[^\n]*/

+44 -39  lark/lark.py

@@ -7,7 +7,7 @@ import tempfile
from warnings import warn

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar, FromPackageLoader
from .load_grammar import load_grammar, FromPackageLoader, Grammar
from .tree import Tree
from .common import LexerConf, ParserConf

@@ -234,42 +234,50 @@ class Lark(Serialize):
else:
    grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.source_grammar = grammar
if self.options.use_bytes:
    if not isascii(grammar):
        raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
    if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
        raise ConfigurationError("`use_bytes=True` may have issues on python2."
                                 "Use `use_bytes='force'` to use it at your own risk.")

cache_fn = None
if self.options.cache:
    if self.options.parser != 'lalr':
        raise ConfigurationError("cache only works with parser='lalr' for now")
    if isinstance(self.options.cache, STRING_TYPE):
        cache_fn = self.options.cache
    else:
        if self.options.cache is not True:
            raise ConfigurationError("cache argument must be bool or str")
        unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
        from . import __version__
        options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
        s = grammar + options_str + __version__
        md5 = hashlib.md5(s.encode()).hexdigest()
        cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5

    if FS.exists(cache_fn):
        logger.debug('Loading grammar from cache: %s', cache_fn)
        # Remove options that aren't relevant for loading from cache
        for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
            del options[name]
        with FS.open(cache_fn, 'rb') as f:
            try:
                self._load(f, **options)
            except Exception:
                raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
        return
if isinstance(grammar, STRING_TYPE):
    self.source_grammar = grammar
    if self.options.use_bytes:
        if not isascii(grammar):
            raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
        if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
            raise ConfigurationError("`use_bytes=True` may have issues on python2."
                                     "Use `use_bytes='force'` to use it at your own risk.")
    if self.options.cache:
        if self.options.parser != 'lalr':
            raise ConfigurationError("cache only works with parser='lalr' for now")
        if isinstance(self.options.cache, STRING_TYPE):
            cache_fn = self.options.cache
        else:
            if self.options.cache is not True:
                raise ConfigurationError("cache argument must be bool or str")
            unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
            from . import __version__
            options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
            s = grammar + options_str + __version__
            md5 = hashlib.md5(s.encode()).hexdigest()
            cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5
        if FS.exists(cache_fn):
            logger.debug('Loading grammar from cache: %s', cache_fn)
            # Remove options that aren't relevant for loading from cache
            for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
                del options[name]
            with FS.open(cache_fn, 'rb') as f:
                try:
                    self._load(f, **options)
                except Exception:
                    raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
            return


    # Parse the grammar file and compose the grammars
    self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
else:
    assert isinstance(grammar, Grammar)
    self.grammar = grammar

if self.options.lexer == 'auto':
    if self.options.parser == 'lalr':
@@ -301,9 +309,6 @@ class Lark(Serialize):
if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
    raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)

if self.options.postlex is not None:
    terminals_to_keep = set(self.options.postlex.always_accept)
else:


+290 -273  lark/load_grammar.py

@@ -6,6 +6,7 @@ from copy import copy, deepcopy
from io import open
import pkgutil
from ast import literal_eval
from numbers import Integral

from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -95,6 +96,7 @@ TERMINALS = {
'_IGNORE': r'%ignore',
'_OVERRIDE': r'%override',
'_DECLARE': r'%declare',
'_EXTEND': r'%extend',
'_IMPORT': r'%import',
'NUMBER': r'[+-]?\d+',
}
@@ -149,8 +151,11 @@ RULES = {

'term': ['TERMINAL _COLON expansions _NL',
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
'statement': ['ignore', 'import', 'declare', 'override_rule'],
'override_rule': ['_OVERRIDE rule'],
'statement': ['ignore', 'import', 'declare', 'override', 'extend'],
'override': ['_OVERRIDE rule',
'_OVERRIDE term'],
'extend': ['_EXTEND rule',
'_EXTEND term'],
'ignore': ['_IGNORE expansions _NL'],
'declare': ['_DECLARE _declare_args _NL'],
'import': ['_IMPORT _import_path _NL',
@@ -298,15 +303,6 @@ class RuleTreeToText(Transformer):
return expansion, alias.value


@inline_args
class CanonizeTree(Transformer_InPlace):
def tokenmods(self, *args):
if len(args) == 1:
return list(args)
tokenmods, value = args
return tokenmods + [value]


class PrepareAnonTerminals(Transformer_InPlace):
"""Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"""

@@ -546,10 +542,6 @@ class PrepareSymbols(Transformer_InPlace):
assert False


def _choice_of_rules(rules):
return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])


def nr_deepcopy_tree(t):
"""Deepcopy tree `t` without recursion"""
return Transformer_NonRecursive(False).transform(t)
@@ -736,58 +728,6 @@ class FromPackageLoader(object):

stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)

_imported_grammars = {}


def import_from_grammar_into_namespace(grammar, namespace, aliases):
"""Returns all rules and terminals of grammar, prepended
with a 'namespace' prefix, except for those which are aliased.
"""

imported_terms = dict(grammar.term_defs)
imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs}

term_defs = []
rule_defs = []

def rule_dependencies(symbol):
if symbol.type != 'RULE':
return []
try:
_, params, tree,_ = imported_rules[symbol]
except KeyError:
raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
return _find_used_symbols(tree) - set(params)

def get_namespace_name(name, params):
if params is not None:
try:
return params[name]
except KeyError:
pass
try:
return aliases[name].value
except KeyError:
if name[0] == '_':
return '_%s__%s' % (namespace, name[1:])
return '%s__%s' % (namespace, name)

to_import = list(bfs(aliases, rule_dependencies))
for symbol in to_import:
if symbol.type == 'TERMINAL':
term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]])
else:
assert symbol.type == 'RULE'
_, params, tree, options = imported_rules[symbol]
params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s') % (namespace, p) for p in params}
for t in tree.iter_subtrees():
for i, c in enumerate(t.children):
if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
t.children[i] = Token(c.type, get_namespace_name(c, params_map))
params = [params_map[p] for p in params] # We can not rely on ordered dictionaries
rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options))

return term_defs, rule_defs


def resolve_term_references(term_defs):
@@ -859,8 +799,25 @@ def _find_used_symbols(tree):
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}


class GrammarLoader:
ERRORS = [
def _grammar_parser():
try:
return _grammar_parser.cache
except AttributeError:
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, ['start'])
lexer_conf.lexer_type = 'standard'
parser_conf.parser_type = 'lalr'
_grammar_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {})
return _grammar_parser.cache

GRAMMAR_ERRORS = [
('Unclosed parenthesis', ['a: (\n']),
('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
@@ -874,231 +831,291 @@ class GrammarLoader:
('%ignore expects a value', ['%ignore %import\n']),
]

def __init__(self, global_keep_all_tokens):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, ['start'])
lexer_conf.lexer_type = 'standard'
parser_conf.parser_type = 'lalr'
self.parser = ParsingFrontend(lexer_conf, parser_conf, {})

self.canonize_tree = CanonizeTree()
def _parse_grammar(text, name, start='start'):
try:
return PrepareGrammar().transform(_grammar_parser().parse(text + '\n', start))
except UnexpectedCharacters as e:
context = e.get_context(text)
raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
(e.line, e.column, name, context))
except UnexpectedToken as e:
context = e.get_context(text)
error = e.match_examples(_grammar_parser().parse, GRAMMAR_ERRORS, use_accepts=True)
if error:
raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
elif 'STRING' in e.expected:
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
raise


class GrammarBuilder:
def __init__(self, global_keep_all_tokens=False, import_paths=None):
self.global_keep_all_tokens = global_keep_all_tokens

def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
if grammar_path not in _imported_grammars:
# import_paths take priority over base_path since they should handle relative imports and ignore everything else.
to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
self.import_paths = import_paths or []

self._definitions = {}
self._ignore_names = []
def _is_term(self, name):
# Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
# Only the last part is the actual name, and the rest might contain mixed case
return name.rpartition('__')[-1].isupper()

def _grammar_error(self, msg, *names):
args = {}
for i, name in enumerate(names, start=1):
postfix = '' if i == 1 else str(i)
args['name' + postfix] = name
args['type' + postfix] = lowercase_type = ("rule", "terminal")[self._is_term(name)]
args['Type' + postfix] = lowercase_type.title()
raise GrammarError(msg.format(**args))
def _check_options(self, name, options):
if self._is_term(name):
if options is None:
options = 1
# if we don't use Integral here, we run into python2.7/python3 problems with long vs int
elif not isinstance(options, Integral):
raise GrammarError("Terminal require a single int as 'options' (e.g. priority), got %s" % (type(options),))
else:
if options is None:
options = RuleOptions()
elif not isinstance(options, RuleOptions):
raise GrammarError("Rules require a RuleOptions instance as 'options'")
if self.global_keep_all_tokens:
options.keep_all_tokens = True
return options

def _define(self, name, exp, params=(), options=None, override=False):
if (name in self._definitions) ^ override:
if override:
self._grammar_error("Cannot override a nonexisting {type} {name}", name)
else:
self._grammar_error("{Type} '{name}' defined more than once", name)
if name.startswith('__'):
self._grammar_error('Names starting with double-underscore are reserved (Error at {name})', name)
self._definitions[name] = (params, exp, self._check_options(name, options))

def _extend(self, name, exp, params=(), options=None):
if name not in self._definitions:
self._grammar_error("Can't extend {type} {name} as it wasn't defined before", name)
if tuple(params) != tuple(self._definitions[name][0]):
self._grammar_error("Cannot extend {type} with different parameters: {name}", name)
# TODO: think about what to do with 'options'
base = self._definitions[name][1]

while len(base.children) == 2:
assert isinstance(base.children[0], Tree) and base.children[0].data == 'expansions', base
base = base.children[0]
base.children.insert(0, exp)

def _ignore(self, exp_or_name):
if isinstance(exp_or_name, str):
self._ignore_names.append(exp_or_name)
else:
assert isinstance(exp_or_name, Tree)
t = exp_or_name
if t.data == 'expansions' and len(t.children) == 1:
t2 ,= t.children
if t2.data=='expansion' and len(t2.children) == 1:
item ,= t2.children
if item.data == 'value':
item ,= item.children
if isinstance(item, Token) and item.type == 'TERMINAL':
self._ignore_names.append(item.value)
return

name = '__IGNORE_%d'% len(self._ignore_names)
self._ignore_names.append(name)
self._definitions[name] = ((), t, 1)
def _declare(self, *names):
for name in names:
self._define(name, None)
def _mangle_exp(self, exp, mangle):
if mangle is None:
return exp
exp = deepcopy(exp) # TODO: is this needed
for t in exp.iter_subtrees():
for i, c in enumerate(t.children):
if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
t.children[i] = Token(c.type, mangle(c.value))
return exp

def _unpack_definition(self, tree, mangle):
if tree.data == 'rule':
name, params, exp, opts = options_from_rule(*tree.children)
else:
name = tree.children[0].value
params = ()
opts = int(tree.children[1]) if len(tree.children) == 3 else 1 # priority
exp = tree.children[-1]
if mangle is not None:
params = tuple(mangle(p) for p in params)
name = mangle(name)
exp = self._mangle_exp(exp, mangle)
return name, exp, params, opts
def _unpack_import(self, stmt, grammar_name):
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
else:
path_node, = stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import
dotted_path = tuple(path_node.children)
names = arg1.children
aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names
else: # Single import
dotted_path = tuple(path_node.children[:-1])
name = path_node.children[-1] # Get name from dotted path
aliases = {name.value: (arg1 or name).value} # Aliases if exist

if path_node.data == 'import_lib': # Import from library
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
else:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
except IOError:
continue
else:
grammar = self.load_grammar(text, joined_path, import_paths)
_imported_grammars[grammar_path] = grammar
break
base_file = os.path.abspath(sys.modules['__main__'].__file__)
except AttributeError:
base_file = None
else:
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False

return _imported_grammars[grammar_path]

def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
"""Parse grammar_text, verify, and create Grammar object. Display nice messages on error."""

try:
tree = self.canonize_tree.transform(self.parser.parse(grammar_text+'\n'))
except UnexpectedCharacters as e:
context = e.get_context(grammar_text)
raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
(e.line, e.column, grammar_name, context))
except UnexpectedToken as e:
context = e.get_context(grammar_text)
error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True)
if error:
raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
elif 'STRING' in e.expected:
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
raise

tree = PrepareGrammar().transform(tree)

# Extract grammar items
defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
term_defs = defs.pop('term', [])
rule_defs = defs.pop('rule', [])
statements = defs.pop('statement', [])
assert not defs

term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
rule_defs = [options_from_rule(*x) for x in rule_defs]

# Execute statements
ignore, imports = [], {}
overriding_rules = []
for (stmt,) in statements:
if stmt.data == 'ignore':
t ,= stmt.children
ignore.append(t)
elif stmt.data == 'import':
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
else:
path_node ,= stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import
dotted_path = tuple(path_node.children)
names = arg1.children
aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names
else: # Single import
dotted_path = tuple(path_node.children[:-1])
name = path_node.children[-1] # Get name from dotted path
aliases = {name: arg1 or name} # Aliases if exist

if path_node.data == 'import_lib': # Import from library
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
try:
base_file = os.path.abspath(sys.modules['__main__'].__file__)
except AttributeError:
base_file = None
else:
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
else:
base_path = os.path.split(base_file)[0]
else:
base_path = os.path.abspath(os.path.curdir)

base_path = os.path.split(base_file)[0]
else:
base_path = os.path.abspath(os.path.curdir)
return dotted_path, base_path, aliases

def load_grammar(self, grammar_text, grammar_name="<?>", mangle=None):
tree = _parse_grammar(grammar_text, grammar_name)
imports = {} # imports are collect over the whole file to prevent duplications
actions = [] # Some statements need to be delayed (override and extend) till after imports are handled
for stmt in tree.children:
if stmt.data in ('term', 'rule'):
self._define(*self._unpack_definition(stmt, mangle))
continue
assert stmt.data == 'statement', stmt.data
stmt ,= stmt.children
if stmt.data == 'import':
dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name)
try:
import_base_path, import_aliases = imports[dotted_path]
assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
import_aliases.update(aliases)
except KeyError:
imports[dotted_path] = base_path, aliases

elif stmt.data == 'ignore':
# if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar
if mangle is None:
self._ignore(*stmt.children)
elif stmt.data == 'declare':
for t in stmt.children:
term_defs.append([t.value, (None, None)])
elif stmt.data == 'override_rule':
if mangle is None:
self._declare(*(t.value for t in stmt.children))
else:
self._declare(*(mangle(t.value) for t in stmt.children))
elif stmt.data == 'override':
r ,= stmt.children
actions.append((self._define, self._unpack_definition(r, mangle) + (True,)))
elif stmt.data == 'extend':
r ,= stmt.children
overriding_rules.append(options_from_rule(*r.children))
actions.append((self._extend, self._unpack_definition(r, mangle)))
else:
assert False, stmt

# import grammars
for dotted_path, (base_path, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td
rule_defs += new_rd

# replace rules by overridding rules, according to name
for r in overriding_rules:
name = r[0]
# remove overridden rule from rule_defs
overridden, rule_defs = classify_bool(rule_defs, lambda r: r[0] == name) # FIXME inefficient
if not overridden:
raise GrammarError("Cannot override a nonexisting rule: %s" % name)
rule_defs.append(r)

## Handle terminals

# Verify correctness 1
for name, _ in term_defs:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

# Handle ignore tokens
# XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
# inability to handle duplicate terminals (two names, one value)
ignore_names = []
for t in ignore:
if t.data=='expansions' and len(t.children) == 1:
t2 ,= t.children
if t2.data=='expansion' and len(t2.children) == 1:
item ,= t2.children
if item.data == 'value':
item ,= item.children
if isinstance(item, Token) and item.type == 'TERMINAL':
ignore_names.append(item.value)
continue

name = '__IGNORE_%d'% len(ignore_names)
ignore_names.append(name)
term_defs.append((name, (t, 1)))

# Verify correctness 2
terminal_names = set()
for name, _ in term_defs:
if name in terminal_names:
raise GrammarError("Terminal '%s' defined more than once" % name)
terminal_names.add(name)

if set(ignore_names) > terminal_names:
raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))

resolve_term_references(term_defs)

## Handle rules

rule_names = {}
for name, params, _x, option in rule_defs:
# We can't just simply not throw away the tokens later, we need option.keep_all_tokens to correctly generate maybe_placeholders
if self.global_keep_all_tokens:
option.keep_all_tokens = True

if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if name in rule_names:
raise GrammarError("Rule '%s' defined more than once" % name)
rule_names[name] = len(params)

for name, params , expansions, _o in rule_defs:
self.do_import(dotted_path, base_path, aliases, mangle)
for f, args in actions:
f(*args)
def do_import(self, dotted_path, base_path, aliases, base_mangle=None):
mangle = self.get_mangle('__'.join(dotted_path), aliases, base_mangle)
grammar_path = os.path.join(*dotted_path) + EXT
to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
else:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
except IOError:
continue
else:
self.load_grammar(text, joined_path, mangle)
break
else:
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,)

def get_mangle(self, prefix, aliases, base_mangle=None):
def mangle(s):
if s in aliases:
s = aliases[s]
else:
if s[0] == '_':
s = '_%s__%s' % (prefix, s[1:])
else:
s = '%s__%s' % (prefix, s)
if base_mangle is not None:
s = base_mangle(s)
return s
return mangle

def check(self):
for name, (params, exp, options) in self._definitions.items():
for i, p in enumerate(params):
if p in rule_names:
if p in self._definitions:
raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name))
if p in params[:i]:
raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name))
for temp in expansions.find_data('template_usage'):
if exp is None: # Remaining checks don't work for abstract rules/terminals
continue

for temp in exp.find_data('template_usage'):
sym = temp.children[0]
args = temp.children[1:]
if sym not in params:
if sym not in rule_names:
raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name))
if len(args) != rule_names[sym]:
raise GrammarError("Wrong number of template arguments used for %s "
"(expected %s, got %s) (in rule %s)" % (sym, rule_names[sym], len(args), name))
for sym in _find_used_symbols(expansions):
if sym.type == 'TERMINAL':
if sym not in terminal_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
else:
if sym not in rule_names and sym not in params:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

return Grammar(rule_defs, term_defs, ignore_names)

if sym not in self._definitions:
self._grammar_error("Template '%s' used but not defined (in {type} {name})" % sym, name)
if len(args) != len(self._definitions[sym][0]):
expected, actual = len(self._definitions[sym][0]), len(args)
self._grammar_error("Wrong number of template arguments used for {name} "
"(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name)
for sym in _find_used_symbols(exp):
if sym not in self._definitions and sym not in params:
self._grammar_error("{Type} '{name}' used but not defined (in {type2} {name2})", sym, name)

if not set(self._definitions).issuperset(self._ignore_names):
raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions)))

def build(self):
self.check()
rule_defs = []
term_defs = []
for name, (params, exp, options) in self._definitions.items():
if self._is_term(name):
assert len(params) == 0
term_defs.append((name, (exp, options)))
else:
rule_defs.append((name, params, exp, options))
resolve_term_references(term_defs)
return Grammar(rule_defs, term_defs, self._ignore_names)

def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
builder = GrammarBuilder(global_keep_all_tokens, import_paths)
builder.load_grammar(grammar, source)
return builder.build()
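The import name-mangling implemented by `get_mangle` above can be observed in isolation: aliased names pass through unchanged, every other name gets a `namespace__name` prefix, and a leading underscore is kept in front so the name retains its special meaning. A small sketch (the prefix and symbol names here are made up):

```python
from lark.load_grammar import GrammarBuilder

builder = GrammarBuilder()
mangle = builder.get_mangle('common', {'NUMBER': 'NUMBER'})

print(mangle('NUMBER'))   # NUMBER          (aliased, kept as-is)
print(mangle('SIGNED'))   # common__SIGNED
print(mangle('_ws'))      # _common__ws     (leading underscore preserved)
```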

+35 -5  tests/test_grammar.py

@@ -3,8 +3,8 @@ from __future__ import absolute_import
import sys
from unittest import TestCase, main

from lark import Lark
from lark.load_grammar import GrammarLoader, GrammarError
from lark import Lark, Token, Tree
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS


class TestGrammar(TestCase):
@@ -12,7 +12,7 @@ class TestGrammar(TestCase):
        pass

    def test_errors(self):
        for msg, examples in GrammarLoader.ERRORS:
        for msg, examples in GRAMMAR_ERRORS:
            for example in examples:
                try:
                    p = Lark(example)
@@ -21,7 +21,7 @@ class TestGrammar(TestCase):
                else:
                    assert False, "example did not raise an error"

    def test_override(self):
    def test_override_rule(self):
        # Overrides the 'sep' template in existing grammar to add an optional terminating delimiter
        # Thus extending it beyond its original capacity
        p = Lark("""
@@ -29,12 +29,42 @@ class TestGrammar(TestCase):

            %override sep{item, delim}: item (delim item)* delim?
            %ignore " "
            """)
            """, source_path=__file__)

        a = p.parse('[1, 2, 3]')
        b = p.parse('[1, 2, 3, ]')
        assert a == b

    def test_override_terminal(self):
        p = Lark("""
            %import .grammars.ab (startab, A, B)
            %override A: "c"
            %override B: "d"
            """, start='startab', source_path=__file__)

        a = p.parse('cd')
        self.assertEqual(a.children[0].children, [Token('A', 'c'), Token('B', 'd')])

    def test_extend_rule(self):
        p = Lark("""
            %import .grammars.ab (startab, A, B, expr)

            %extend expr: B A
            """, start='startab', source_path=__file__)
        a = p.parse('abab')
        self.assertEqual(a.children[0].children, ['a', Tree('expr', ['b', 'a']), 'b'])

    def test_extend_term(self):
        p = Lark("""
            %import .grammars.ab (startab, A, B, expr)
            %extend A: "c"
            """, start='startab', source_path=__file__)
        a = p.parse('acbb')
        self.assertEqual(a.children[0].children, ['a', Tree('expr', ['c', 'b']), 'b'])



if __name__ == '__main__':

