
Merge pull request #816 from lark-parser/MegaIng-grammar_builder

Erez Shinan, 4 years ago (committed by GitHub) · commit f7442d3be6
11 changed files with 600 additions and 373 deletions:

 1. docs/grammar.md                       +20    -1
 2. examples/advanced/extend_python.py    +46    -0
 3. lark-stubs/__init__.pyi                +1    -0
 4. lark-stubs/grammar.pyi                 +9    -0
 5. lark-stubs/lark.pyi                    +3    -1
 6. lark-stubs/load_grammar.pyi           +28    -0
 7. lark/grammars/common.lark              +1    -1
 8. lark/lark.py                          +44   -39
 9. lark/load_grammar.py                 +319  -268
10. tests/test_grammar.py                +129    -5
11. tests/test_parser.py                   +0   -58

docs/grammar.md (+20, -1)

@@ -291,7 +291,7 @@ Declare a terminal without defining it. Useful for plugins.

### %override

Override a rule, affecting all the rules that refer to it.
Override a rule or terminal, affecting all references to it, even in imported grammars.

Useful for implementing an inheritance pattern when importing grammars.

@@ -302,3 +302,22 @@ Useful for implementing an inheritance pattern when importing grammars.
// Add hex support to my_grammar
%override number: NUMBER | /0x\w+/
```
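
A minimal usage sketch of the overriding grammar above. Assumptions: a sibling `my_grammar.lark` that defines `start`, `number`, and `NUMBER` (the file name, rule names, and input are all illustrative); `source_path` is the keyword this PR's own tests use to anchor relative imports.

```python
from lark import Lark

# Minimal sketch, assuming my_grammar.lark sits next to this script and
# defines `start`, `number` and `NUMBER`. source_path anchors the
# relative %import to this file.
parser = Lark(r"""
%import .my_grammar (start, NUMBER)

// Add hex support to my_grammar
%override number: NUMBER | /0x\w+/
""", source_path=__file__)

print(parser.parse("0xDEADBEEF").pretty())  # illustrative input
```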

### %extend

Extend the definition of a rule or terminal, e.g. add a new option for what it can match, as if the alternatives were separated with `|`.

Useful for splitting up the definition of a complex rule with many different options over multiple files.

Can also be used to implement a plugin system where a core grammar is extended by others.


**Example:**
```perl
%import my_grammar (start, NUMBER)

// Add hex support to my_grammar
%extend NUMBER: /0x\w+/
```

For both `%extend` and `%override`, there is no requirement for the rule/terminal to come from another file, but that is probably the most common use case.
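
As a quick illustration of that note, here is a self-contained sketch (mirroring `test_extend_twice` from this PR's test suite) where the extended rule lives in the same grammar string:

```python
from lark import Lark

# %extend applied to a rule defined in the same grammar -- no %import needed.
parser = Lark(r"""
start: x+

x: "a"
%extend x: "b"
%extend x: "c"
""")

# All three alternatives now match, so these inputs parse to equal trees.
assert parser.parse("abc") == parser.parse("cba")
```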

examples/advanced/extend_python.py (+46, -0)

@@ -0,0 +1,46 @@
"""
Extend the Python Grammar
==============================

This example demonstrates how to use the `%extend` statement,
to add new syntax to the example Python grammar.

"""

from lark.lark import Lark
from python_parser import PythonIndenter

GRAMMAR = r"""
%import .python3 (compound_stmt, single_input, file_input, eval_input, test, suite, _NEWLINE, _INDENT, _DEDENT, COMMENT)

%extend compound_stmt: match_stmt

match_stmt: "match" test ":" cases
cases: _NEWLINE _INDENT case+ _DEDENT

case: "case" test ":" suite // test is not quite correct.

%ignore /[\t \f]+/ // WS
%ignore /\\[\t \f]*\r?\n/ // LINE_CONT
%ignore COMMENT
"""

parser = Lark(GRAMMAR, parser='lalr', start=['single_input', 'file_input', 'eval_input'], postlex=PythonIndenter())

tree = parser.parse(r"""
def name(n):
match n:
case 1:
print("one")
case 2:
print("two")
case _:
print("number is too big")

""", start='file_input')

# Remove the 'python3__' prefix that was added to the implicitly imported rules.
for t in tree.iter_subtrees():
t.data = t.data.rsplit('__', 1)[-1]

print(tree.pretty())

lark-stubs/__init__.pyi (+1, -0)

@@ -4,6 +4,7 @@ from .tree import *
from .visitors import *
from .exceptions import *
from .lexer import *
from .load_grammar import *
from .lark import *
from logging import Logger as _Logger



lark-stubs/grammar.pyi (+9, -0)

@@ -0,0 +1,9 @@
from typing import Optional, Tuple


class RuleOptions:
keep_all_tokens: bool
expand1: bool
priority: int
template_source: Optional[str]
empty_indices: Tuple[bool, ...]

lark-stubs/lark.pyi (+3, -1)

@@ -8,6 +8,7 @@ from .visitors import Transformer
from .lexer import Token, Lexer, TerminalDef
from .tree import Tree
from .exceptions import UnexpectedInput
from .load_grammar import Grammar

_T = TypeVar('_T')

@@ -54,13 +55,14 @@ class FromPackageLoader:
class Lark:
source_path: str
source_grammar: str
grammar: Grammar
options: LarkOptions
lexer: Lexer
terminals: List[TerminalDef]

def __init__(
self,
grammar: Union[str, IO[str]],
grammar: Union[Grammar, str, IO[str]],
*,
start: Union[None, str, List[str]] = "start",
parser: Literal["earley", "lalr", "cyk"] = "auto",


lark-stubs/load_grammar.pyi (+28, -0)

@@ -0,0 +1,28 @@
from typing import List, Tuple, Union, Callable, Dict, Optional

from lark import Tree
from lark.grammar import RuleOptions


class Grammar:
rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
term_defs: List[Tuple[str, Tuple[Tree, int]]]
ignore: List[str]


class GrammarBuilder:
global_keep_all_tokens: bool
import_paths: List[Union[str, Callable]]

def __init__(self, global_keep_all_tokens=..., import_paths=...): ...

def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None): ...

def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str],
base_mangle: Callable[[str], str] = None): ...

def get_mangle(self, prefix: str, aliases: Dict[str, str], base_mangle: Callable[[str], str] = None): ...

def validate(self) -> None: ...

def build(self) -> Grammar: ...
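
A hedged sketch of how these pieces fit together, based only on the signatures above and the `Grammar`-accepting `Lark` constructor from `lark.pyi` above (the grammar text is illustrative):

```python
from lark import Lark
from lark.load_grammar import GrammarBuilder

# Build a Grammar programmatically, then hand it straight to Lark(),
# which (per lark.pyi above) now accepts Grammar | str | IO[str].
builder = GrammarBuilder()
builder.load_grammar('start: "a"+\n')
grammar = builder.build()  # validates and returns a Grammar
parser = Lark(grammar, parser='lalr')
print(parser.parse('aaa'))
```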

lark/grammars/common.lark (+1, -1)

@@ -55,5 +55,5 @@ NEWLINE: (CR? LF)+
// Comments
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /.*?/s "*/"
C_COMMENT: "/*" /(.|\n)*?/ "*/"
SQL_COMMENT: /--[^\n]*/
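
An illustrative check of the rewritten terminal (plain Python `re`, not part of the diff): dropping the inline DOTALL flag works because `(.|\n)` matches newlines explicitly.

```python
import re

# The new C_COMMENT body, expanded by hand: "/*" /(.|\n)*?/ "*/"
c_comment = re.compile(r'/\*(.|\n)*?\*/')
assert c_comment.match('/* spans\nseveral\nlines */')
assert not c_comment.match('/* unterminated')
```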

lark/lark.py (+44, -39)

@@ -7,7 +7,7 @@ import tempfile
from warnings import warn

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar, FromPackageLoader
from .load_grammar import load_grammar, FromPackageLoader, Grammar
from .tree import Tree
from .common import LexerConf, ParserConf

@@ -234,42 +234,50 @@ class Lark(Serialize):
else:
grammar = read()

assert isinstance(grammar, STRING_TYPE)
self.source_grammar = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise ConfigurationError("`use_bytes=True` may have issues on python2."
"Use `use_bytes='force'` to use it at your own risk.")

cache_fn = None
if self.options.cache:
if self.options.parser != 'lalr':
raise ConfigurationError("cache only works with parser='lalr' for now")
if isinstance(self.options.cache, STRING_TYPE):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ConfigurationError("cache argument must be bool or str")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
from . import __version__
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
s = grammar + options_str + __version__
md5 = hashlib.md5(s.encode()).hexdigest()
cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5

if FS.exists(cache_fn):
logger.debug('Loading grammar from cache: %s', cache_fn)
# Remove options that aren't relevant for loading from cache
for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
del options[name]
with FS.open(cache_fn, 'rb') as f:
try:
self._load(f, **options)
except Exception:
raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
return
if isinstance(grammar, STRING_TYPE):
self.source_grammar = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise ConfigurationError("`use_bytes=True` may have issues on python2."
"Use `use_bytes='force'` to use it at your own risk.")
if self.options.cache:
if self.options.parser != 'lalr':
raise ConfigurationError("cache only works with parser='lalr' for now")
if isinstance(self.options.cache, STRING_TYPE):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ConfigurationError("cache argument must be bool or str")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
from . import __version__
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
s = grammar + options_str + __version__
md5 = hashlib.md5(s.encode()).hexdigest()
cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5
if FS.exists(cache_fn):
logger.debug('Loading grammar from cache: %s', cache_fn)
# Remove options that aren't relevant for loading from cache
for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
del options[name]
with FS.open(cache_fn, 'rb') as f:
try:
self._load(f, **options)
except Exception:
raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
return


# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
else:
assert isinstance(grammar, Grammar)
self.grammar = grammar

if self.options.lexer == 'auto':
if self.options.parser == 'lalr':
@@ -301,9 +309,6 @@ class Lark(Serialize):
if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))

# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)

if self.options.postlex is not None:
terminals_to_keep = set(self.options.postlex.always_accept)
else:
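
For reference, an illustrative reconstruction of the cache-file naming used in the hunk above (the three string components stand in for the real grammar text, serialized options, and lark `__version__`):

```python
import hashlib
import tempfile

# Stand-ins for the real inputs: grammar text, hashable options, __version__.
grammar_text, options_str, version = 'start: "a"+\n', 'parserlalr', '0.11.2'
md5 = hashlib.md5((grammar_text + options_str + version).encode()).hexdigest()
print(tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5)
```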


lark/load_grammar.py (+319, -268)

@@ -6,6 +6,7 @@ from copy import copy, deepcopy
from io import open
import pkgutil
from ast import literal_eval
from numbers import Integral

from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -95,6 +96,7 @@ TERMINALS = {
'_IGNORE': r'%ignore',
'_OVERRIDE': r'%override',
'_DECLARE': r'%declare',
'_EXTEND': r'%extend',
'_IMPORT': r'%import',
'NUMBER': r'[+-]?\d+',
}
@@ -102,7 +104,7 @@ TERMINALS = {
RULES = {
'start': ['_list'],
'_list': ['_item', '_list _item'],
'_item': ['rule', 'term', 'statement', '_NL'],
'_item': ['rule', 'term', 'ignore', 'import', 'declare', 'override', 'extend', '_NL'],

'rule': ['RULE template_params _COLON expansions _NL',
'RULE template_params _DOT NUMBER _COLON expansions _NL'],
@@ -149,8 +151,10 @@ RULES = {

'term': ['TERMINAL _COLON expansions _NL',
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
'statement': ['ignore', 'import', 'declare', 'override_rule'],
'override_rule': ['_OVERRIDE rule'],
'override': ['_OVERRIDE rule',
'_OVERRIDE term'],
'extend': ['_EXTEND rule',
'_EXTEND term'],
'ignore': ['_IGNORE expansions _NL'],
'declare': ['_DECLARE _declare_args _NL'],
'import': ['_IMPORT _import_path _NL',
@@ -298,15 +302,6 @@ class RuleTreeToText(Transformer):
return expansion, alias.value


@inline_args
class CanonizeTree(Transformer_InPlace):
def tokenmods(self, *args):
if len(args) == 1:
return list(args)
tokenmods, value = args
return tokenmods + [value]


class PrepareAnonTerminals(Transformer_InPlace):
"""Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"""

@@ -546,10 +541,6 @@ class PrepareSymbols(Transformer_InPlace):
assert False


def _choice_of_rules(rules):
return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])


def nr_deepcopy_tree(t):
"""Deepcopy tree `t` without recursion"""
return Transformer_NonRecursive(False).transform(t)
@@ -736,69 +727,14 @@ class FromPackageLoader(object):

stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)

_imported_grammars = {}


def import_from_grammar_into_namespace(grammar, namespace, aliases):
"""Returns all rules and terminals of grammar, prepended
with a 'namespace' prefix, except for those which are aliased.
"""

imported_terms = dict(grammar.term_defs)
imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs}

term_defs = []
rule_defs = []

def rule_dependencies(symbol):
if symbol.type != 'RULE':
return []
try:
_, params, tree,_ = imported_rules[symbol]
except KeyError:
raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
return _find_used_symbols(tree) - set(params)

def get_namespace_name(name, params):
if params is not None:
try:
return params[name]
except KeyError:
pass
try:
return aliases[name].value
except KeyError:
if name[0] == '_':
return '_%s__%s' % (namespace, name[1:])
return '%s__%s' % (namespace, name)

to_import = list(bfs(aliases, rule_dependencies))
for symbol in to_import:
if symbol.type == 'TERMINAL':
term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]])
else:
assert symbol.type == 'RULE'
_, params, tree, options = imported_rules[symbol]
params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s') % (namespace, p) for p in params}
for t in tree.iter_subtrees():
for i, c in enumerate(t.children):
if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
t.children[i] = Token(c.type, get_namespace_name(c, params_map))
params = [params_map[p] for p in params] # We can not rely on ordered dictionaries
rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options))

return term_defs, rule_defs


def resolve_term_references(term_defs):
def resolve_term_references(term_dict):
# TODO Solve with transitive closure (maybe)

term_dict = {k:t for k, (t,_p) in term_defs}
assert len(term_dict) == len(term_defs), "Same name defined twice?"

while True:
changed = False
for name, (token_tree, _p) in term_defs:
for name, token_tree in term_dict.items():
if token_tree is None: # Terminal added through %declare
continue
for exp in token_tree.find_data('value'):
@@ -859,8 +795,25 @@ def _find_used_symbols(tree):
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}


class GrammarLoader:
ERRORS = [
def _get_parser():
try:
return _get_parser.cache
except AttributeError:
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, ['start'])
lexer_conf.lexer_type = 'standard'
parser_conf.parser_type = 'lalr'
_get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {})
return _get_parser.cache

GRAMMAR_ERRORS = [
('Unclosed parenthesis', ['a: (\n']),
('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
@@ -874,120 +827,202 @@ class GrammarLoader:
('%ignore expects a value', ['%ignore %import\n']),
]

def __init__(self, global_keep_all_tokens):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
def _parse_grammar(text, name, start='start'):
try:
tree = _get_parser().parse(text + '\n', start)
except UnexpectedCharacters as e:
context = e.get_context(text)
raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
(e.line, e.column, name, context))
except UnexpectedToken as e:
context = e.get_context(text)
error = e.match_examples(_get_parser().parse, GRAMMAR_ERRORS, use_accepts=True)
if error:
raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
elif 'STRING' in e.expected:
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
raise

return PrepareGrammar().transform(tree)


def _get_mangle(prefix, aliases, base_mangle=None):
def mangle(s):
if s in aliases:
s = aliases[s]
else:
if s[0] == '_':
s = '_%s__%s' % (prefix, s[1:])
else:
s = '%s__%s' % (prefix, s)
if base_mangle is not None:
s = base_mangle(s)
return s
return mangle

def _mangle_exp(exp, mangle):
if mangle is None:
return exp
exp = deepcopy(exp) # TODO: is this needed
for t in exp.iter_subtrees():
for i, c in enumerate(t.children):
if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
t.children[i] = Token(c.type, mangle(c.value))
return exp



class GrammarBuilder:
def __init__(self, global_keep_all_tokens=False, import_paths=None):
self.global_keep_all_tokens = global_keep_all_tokens
self.import_paths = import_paths or []

self._definitions = {}
self._ignore_names = []

def _is_term(self, name):
# Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
# Only the last part is the actual name, and the rest might contain mixed case
return name.rpartition('__')[-1].isupper()

def _grammar_error(self, msg, *names):
args = {}
for i, name in enumerate(names, start=1):
postfix = '' if i == 1 else str(i)
args['name' + postfix] = name
args['type' + postfix] = lowercase_type = ("rule", "terminal")[self._is_term(name)]
args['Type' + postfix] = lowercase_type.title()
raise GrammarError(msg.format(**args))

def _check_options(self, name, options):
if self._is_term(name):
if options is None:
options = 1
# if we don't use Integral here, we run into python2.7/python3 problems with long vs int
elif not isinstance(options, Integral):
raise GrammarError("Terminal require a single int as 'options' (e.g. priority), got %s" % (type(options),))
else:
if options is None:
options = RuleOptions()
elif not isinstance(options, RuleOptions):
raise GrammarError("Rules require a RuleOptions instance as 'options'")
if self.global_keep_all_tokens:
options.keep_all_tokens = True
return options


def _define(self, name, exp, params=(), options=None, override=False):
if name in self._definitions:
if not override:
self._grammar_error("{Type} '{name}' defined more than once", name)
elif override:
self._grammar_error("Cannot override a nonexisting {type} {name}", name)

if name.startswith('__'):
self._grammar_error('Names starting with double-underscore are reserved (Error at {name})', name)

self._definitions[name] = (params, exp, self._check_options(name, options))

def _extend(self, name, exp, params=(), options=None):
if name not in self._definitions:
self._grammar_error("Can't extend {type} {name} as it wasn't defined before", name)
if tuple(params) != tuple(self._definitions[name][0]):
self._grammar_error("Cannot extend {type} with different parameters: {name}", name)
# TODO: think about what to do with 'options'
base = self._definitions[name][1]

while len(base.children) == 2:
assert isinstance(base.children[0], Tree) and base.children[0].data == 'expansions', base
base = base.children[0]
base.children.insert(0, exp)

def _ignore(self, exp_or_name):
if isinstance(exp_or_name, str):
self._ignore_names.append(exp_or_name)
else:
assert isinstance(exp_or_name, Tree)
t = exp_or_name
if t.data == 'expansions' and len(t.children) == 1:
t2 ,= t.children
if t2.data=='expansion' and len(t2.children) == 1:
item ,= t2.children
if item.data == 'value':
item ,= item.children
if isinstance(item, Token) and item.type == 'TERMINAL':
self._ignore_names.append(item.value)
return

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, ['start'])
lexer_conf.lexer_type = 'standard'
parser_conf.parser_type = 'lalr'
self.parser = ParsingFrontend(lexer_conf, parser_conf, {})
name = '__IGNORE_%d'% len(self._ignore_names)
self._ignore_names.append(name)
self._definitions[name] = ((), t, 1)

self.canonize_tree = CanonizeTree()
self.global_keep_all_tokens = global_keep_all_tokens
def _declare(self, *names):
for name in names:
self._define(name, None)

def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
if grammar_path not in _imported_grammars:
# import_paths take priority over base_path since they should handle relative imports and ignore everything else.
to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
def _unpack_import(self, stmt, grammar_name):
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
else:
path_node, = stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import
dotted_path = tuple(path_node.children)
names = arg1.children
aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names
else: # Single import
dotted_path = tuple(path_node.children[:-1])
if not dotted_path:
name ,= path_node.children
raise GrammarError("Nothing was imported from grammar `%s`" % name)
name = path_node.children[-1] # Get name from dotted path
aliases = {name.value: (arg1 or name).value} # Aliases if exist

if path_node.data == 'import_lib': # Import from library
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
else:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
except IOError:
continue
else:
grammar = self.load_grammar(text, joined_path, import_paths)
_imported_grammars[grammar_path] = grammar
break
base_file = os.path.abspath(sys.modules['__main__'].__file__)
except AttributeError:
base_file = None
else:
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False

return _imported_grammars[grammar_path]

def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
"""Parse grammar_text, verify, and create Grammar object. Display nice messages on error."""

try:
tree = self.canonize_tree.transform(self.parser.parse(grammar_text+'\n'))
except UnexpectedCharacters as e:
context = e.get_context(grammar_text)
raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
(e.line, e.column, grammar_name, context))
except UnexpectedToken as e:
context = e.get_context(grammar_text)
error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True)
if error:
raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
elif 'STRING' in e.expected:
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
raise

tree = PrepareGrammar().transform(tree)

# Extract grammar items
defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
term_defs = defs.pop('term', [])
rule_defs = defs.pop('rule', [])
statements = defs.pop('statement', [])
assert not defs

term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
rule_defs = [options_from_rule(*x) for x in rule_defs]

# Execute statements
ignore, imports = [], {}
overriding_rules = []
for (stmt,) in statements:
if stmt.data == 'ignore':
t ,= stmt.children
ignore.append(t)
elif stmt.data == 'import':
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
else:
path_node ,= stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import
dotted_path = tuple(path_node.children)
names = arg1.children
aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names
else: # Single import
dotted_path = tuple(path_node.children[:-1])
name = path_node.children[-1] # Get name from dotted path
aliases = {name: arg1 or name} # Aliases if exist

if path_node.data == 'import_lib': # Import from library
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
try:
base_file = os.path.abspath(sys.modules['__main__'].__file__)
except AttributeError:
base_file = None
else:
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
else:
base_path = os.path.split(base_file)[0]
else:
base_path = os.path.abspath(os.path.curdir)
base_path = os.path.split(base_file)[0]
else:
base_path = os.path.abspath(os.path.curdir)

return dotted_path, base_path, aliases

def _unpack_definition(self, tree, mangle):
if tree.data == 'rule':
name, params, exp, opts = options_from_rule(*tree.children)
else:
name = tree.children[0].value
params = () # TODO terminal templates
opts = int(tree.children[1]) if len(tree.children) == 3 else 1 # priority
exp = tree.children[-1]

if mangle is not None:
params = tuple(mangle(p) for p in params)
name = mangle(name)

exp = _mangle_exp(exp, mangle)
return name, exp, params, opts


def load_grammar(self, grammar_text, grammar_name="<?>", mangle=None, dotted_path=None):
tree = _parse_grammar(grammar_text, grammar_name)

imports = {}
for stmt in tree.children:
if stmt.data == 'import':
dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name)
try:
import_base_path, import_aliases = imports[dotted_path]
assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
@@ -995,110 +1030,126 @@ class GrammarLoader:
except KeyError:
imports[dotted_path] = base_path, aliases

elif stmt.data == 'declare':
for t in stmt.children:
term_defs.append([t.value, (None, None)])
elif stmt.data == 'override_rule':
for dotted_path, (base_path, aliases) in imports.items():
self.do_import(dotted_path, base_path, aliases, mangle)

for stmt in tree.children:
if stmt.data in ('term', 'rule'):
self._define(*self._unpack_definition(stmt, mangle))
elif stmt.data == 'override':
r ,= stmt.children
self._define(*self._unpack_definition(r, mangle), override=True)
elif stmt.data == 'extend':
r ,= stmt.children
overriding_rules.append(options_from_rule(*r.children))
self._extend(*self._unpack_definition(r, mangle))
elif stmt.data == 'ignore':
# if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar
if mangle is None:
self._ignore(*stmt.children)
elif stmt.data == 'declare':
names = [t.value for t in stmt.children]
if mangle is None:
self._declare(*names)
else:
self._declare(*map(mangle, names))
elif stmt.data == 'import':
pass
else:
assert False, stmt

# import grammars
for dotted_path, (base_path, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td
rule_defs += new_rd

# replace rules by overridding rules, according to name
for r in overriding_rules:
name = r[0]
# remove overridden rule from rule_defs
overridden, rule_defs = classify_bool(rule_defs, lambda r: r[0] == name) # FIXME inefficient
if not overridden:
raise GrammarError("Cannot override a nonexisting rule: %s" % name)
rule_defs.append(r)

## Handle terminals

# Verify correctness 1
for name, _ in term_defs:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

# Handle ignore tokens
# XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
# inability to handle duplicate terminals (two names, one value)
ignore_names = []
for t in ignore:
if t.data=='expansions' and len(t.children) == 1:
t2 ,= t.children
if t2.data=='expansion' and len(t2.children) == 1:
item ,= t2.children
if item.data == 'value':
item ,= item.children
if isinstance(item, Token) and item.type == 'TERMINAL':
ignore_names.append(item.value)
continue

name = '__IGNORE_%d'% len(ignore_names)
ignore_names.append(name)
term_defs.append((name, (t, 1)))
term_defs = { name: exp
for name, (_params, exp, _options) in self._definitions.items()
if self._is_term(name)
}
resolve_term_references(term_defs)

# Verify correctness 2
terminal_names = set()
for name, _ in term_defs:
if name in terminal_names:
raise GrammarError("Terminal '%s' defined more than once" % name)
terminal_names.add(name)

if set(ignore_names) > terminal_names:
raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))
def _remove_unused(self, used):
def rule_dependencies(symbol):
if self._is_term(symbol):
return []
params, tree,_ = self._definitions[symbol]
return _find_used_symbols(tree) - set(params)

resolve_term_references(term_defs)
_used = set(bfs(used, rule_dependencies))
self._definitions = {k: v for k, v in self._definitions.items() if k in _used}

## Handle rules

rule_names = {}
for name, params, _x, option in rule_defs:
# We can't just simply not throw away the tokens later, we need option.keep_all_tokens to correctly generate maybe_placeholders
if self.global_keep_all_tokens:
option.keep_all_tokens = True
def do_import(self, dotted_path, base_path, aliases, base_mangle=None):
assert dotted_path
mangle = _get_mangle('__'.join(dotted_path), aliases, base_mangle)
grammar_path = os.path.join(*dotted_path) + EXT
to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
else:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
except IOError:
continue
else:
gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths)
gb.load_grammar(text, joined_path, mangle, dotted_path)
gb._remove_unused(map(mangle, aliases))
for name in gb._definitions:
if name in self._definitions:
raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." % (name, grammar_path))

self._definitions.update(**gb._definitions)
break
else:
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,)

if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if name in rule_names:
raise GrammarError("Rule '%s' defined more than once" % name)
rule_names[name] = len(params)

for name, params , expansions, _o in rule_defs:
def validate(self):
for name, (params, exp, _options) in self._definitions.items():
for i, p in enumerate(params):
if p in rule_names:
if p in self._definitions:
raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name))
if p in params[:i]:
raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name))
for temp in expansions.find_data('template_usage'):

if exp is None: # Remaining checks don't apply to abstract rules/terminals
continue

for temp in exp.find_data('template_usage'):
sym = temp.children[0]
args = temp.children[1:]
if sym not in params:
if sym not in rule_names:
raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name))
if len(args) != rule_names[sym]:
raise GrammarError("Wrong number of template arguments used for %s "
"(expected %s, got %s) (in rule %s)" % (sym, rule_names[sym], len(args), name))
for sym in _find_used_symbols(expansions):
if sym.type == 'TERMINAL':
if sym not in terminal_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
else:
if sym not in rule_names and sym not in params:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

return Grammar(rule_defs, term_defs, ignore_names)

if sym not in self._definitions:
self._grammar_error("Template '%s' used but not defined (in {type} {name})" % sym, name)
if len(args) != len(self._definitions[sym][0]):
expected, actual = len(self._definitions[sym][0]), len(args)
self._grammar_error("Wrong number of template arguments used for {name} "
"(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name)

for sym in _find_used_symbols(exp):
if sym not in self._definitions and sym not in params:
self._grammar_error("{Type} '{name}' used but not defined (in {type2} {name2})", sym, name)

if not set(self._definitions).issuperset(self._ignore_names):
raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions)))

def build(self):
self.validate()
rule_defs = []
term_defs = []
for name, (params, exp, options) in self._definitions.items():
if self._is_term(name):
assert len(params) == 0
term_defs.append((name, (exp, options)))
else:
rule_defs.append((name, params, exp, options))
# resolve_term_references(term_defs)
return Grammar(rule_defs, term_defs, self._ignore_names)

def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
builder = GrammarBuilder(global_keep_all_tokens, import_paths)
builder.load_grammar(grammar, source)
return builder.build()
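
A hypothetical spot-check of the namespacing scheme implemented by `_get_mangle` above (internal helper; the prefix and names are illustrative):

```python
from lark.load_grammar import _get_mangle

mangle = _get_mangle('python3', aliases={})
assert mangle('compound_stmt') == 'python3__compound_stmt'
assert mangle('_NEWLINE') == '_python3__NEWLINE'   # leading '_' stays in front
# Aliased names pass through unchanged, so imported symbols keep their alias.
assert _get_mangle('g', {'start': 'start'})('start') == 'start'
```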

tests/test_grammar.py (+129, -5)

@@ -3,8 +3,9 @@ from __future__ import absolute_import
import sys
from unittest import TestCase, main

from lark import Lark
from lark.load_grammar import GrammarLoader, GrammarError
from lark import Lark, Token, Tree
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS
from lark.load_grammar import FromPackageLoader


class TestGrammar(TestCase):
@@ -12,7 +13,7 @@ class TestGrammar(TestCase):
pass

def test_errors(self):
for msg, examples in GrammarLoader.ERRORS:
for msg, examples in GRAMMAR_ERRORS:
for example in examples:
try:
p = Lark(example)
@@ -21,7 +22,7 @@ class TestGrammar(TestCase):
else:
assert False, "example did not raise an error"

def test_override(self):
def test_override_rule(self):
# Overrides the 'sep' template in existing grammar to add an optional terminating delimiter
# Thus extending it beyond its original capacity
p = Lark("""
@@ -29,12 +30,135 @@ class TestGrammar(TestCase):

%override sep{item, delim}: item (delim item)* delim?
%ignore " "
""")
""", source_path=__file__)

a = p.parse('[1, 2, 3]')
b = p.parse('[1, 2, 3, ]')
assert a == b

self.assertRaises(GrammarError, Lark, """
%import .test_templates_import (start, sep)

%override sep{item}: item (delim item)* delim?
""")

self.assertRaises(GrammarError, Lark, """
%override sep{item}: item (delim item)* delim?
""")

def test_override_terminal(self):
p = Lark("""

%import .grammars.ab (startab, A, B)

%override A: "c"
%override B: "d"
""", start='startab', source_path=__file__)

a = p.parse('cd')
self.assertEqual(a.children[0].children, [Token('A', 'c'), Token('B', 'd')])

def test_extend_rule(self):
p = Lark("""
%import .grammars.ab (startab, A, B, expr)

%extend expr: B A
""", start='startab', source_path=__file__)
a = p.parse('abab')
self.assertEqual(a.children[0].children, ['a', Tree('expr', ['b', 'a']), 'b'])

self.assertRaises(GrammarError, Lark, """
%extend expr: B A
""")

def test_extend_term(self):
p = Lark("""
%import .grammars.ab (startab, A, B, expr)

%extend A: "c"
""", start='startab', source_path=__file__)
a = p.parse('acbb')
self.assertEqual(a.children[0].children, ['a', Tree('expr', ['c', 'b']), 'b'])

def test_extend_twice(self):
p = Lark("""
start: x+

x: "a"
%extend x: "b"
%extend x: "c"
""")

assert p.parse("abccbba") == p.parse("cbabbbb")

def test_undefined_ignore(self):
g = """!start: "A"

%ignore B
"""
self.assertRaises( GrammarError, Lark, g)

g = """!start: "A"

%ignore start
"""
self.assertRaises( GrammarError, Lark, g)

def test_alias_in_terminal(self):
g = """start: TERM
TERM: "a" -> alias
"""
self.assertRaises( GrammarError, Lark, g)

def test_undefined_rule(self):
self.assertRaises(GrammarError, Lark, """start: a""")

def test_undefined_term(self):
self.assertRaises(GrammarError, Lark, """start: A""")

def test_token_multiline_only_works_with_x_flag(self):
g = r"""start: ABC
ABC: / a b c
d
e f
/i
"""
self.assertRaises( GrammarError, Lark, g)

def test_import_custom_sources(self):
custom_loader = FromPackageLoader('tests', ('grammars', ))

grammar = """
start: startab

%import ab.startab
"""

p = Lark(grammar, import_paths=[custom_loader])
self.assertEqual(p.parse('ab'),
Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))

def test_import_custom_sources2(self):
custom_loader = FromPackageLoader('tests', ('grammars', ))

grammar = """
start: rule_to_import

%import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
"""
p = Lark(grammar, import_paths=[custom_loader])
x = p.parse('N')
self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])

def test_import_custom_sources3(self):
custom_loader2 = FromPackageLoader('tests')
grammar = """
%import .test_relative_import (start, WS)
%ignore WS
"""
p = Lark(grammar, import_paths=[custom_loader2], source_path=__file__) # import relative to current file
x = p.parse('12 capybaras')
self.assertEqual(x.children, ['12', 'capybaras'])


if __name__ == '__main__':


tests/test_parser.py (+0, -58)

@@ -11,7 +11,6 @@ from copy import copy, deepcopy
from lark.utils import Py36, isascii

from lark import Token
from lark.load_grammar import FromPackageLoader

try:
from cStringIO import StringIO as cStringIO
@@ -1380,12 +1379,6 @@ def _make_parser_test(LEXER, PARSER):
# A: "a" """)
# self.assertRaises(LexError, g.parse, 'aab')

def test_undefined_rule(self):
self.assertRaises(GrammarError, _Lark, """start: a""")

def test_undefined_token(self):
self.assertRaises(GrammarError, _Lark, """start: A""")

def test_rule_collision(self):
g = _Lark("""start: "a"+ "b"
| "a"+ """)
@@ -1619,15 +1612,6 @@ def _make_parser_test(LEXER, PARSER):
x = g.parse('abcdef')
self.assertEqual(x.children, ['abcdef'])

def test_token_multiline_only_works_with_x_flag(self):
g = r"""start: ABC
ABC: / a b c
d
e f
/i
"""
self.assertRaises( GrammarError, _Lark, g)

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_twice_empty(self):
g = """!start: ("A"?)?
@@ -1639,18 +1623,6 @@ def _make_parser_test(LEXER, PARSER):
tree = l.parse('')
self.assertEqual(tree.children, [])

def test_undefined_ignore(self):
g = """!start: "A"

%ignore B
"""
self.assertRaises( GrammarError, _Lark, g)

def test_alias_in_terminal(self):
g = """start: TERM
TERM: "a" -> alias
"""
self.assertRaises( GrammarError, _Lark, g)

def test_line_and_column(self):
g = r"""!start: "A" bc "D"
@@ -1950,36 +1922,6 @@ def _make_parser_test(LEXER, PARSER):
parser = _Lark(grammar, postlex=CustomIndenter())
parser.parse("a\n b\n")

def test_import_custom_sources(self):
custom_loader = FromPackageLoader('tests', ('grammars', ))

grammar = """
start: startab

%import ab.startab
"""

p = _Lark(grammar, import_paths=[custom_loader])
self.assertEqual(p.parse('ab'),
Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))

grammar = """
start: rule_to_import

%import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
"""
p = _Lark(grammar, import_paths=[custom_loader])
x = p.parse('N')
self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])

custom_loader2 = FromPackageLoader('tests')
grammar = """
%import .test_relative_import (start, WS)
%ignore WS
"""
p = _Lark(grammar, import_paths=[custom_loader2], source_path=__file__) # import relative to current file
x = p.parse('12 capybaras')
self.assertEqual(x.children, ['12', 'capybaras'])

@unittest.skipIf(PARSER == 'cyk', "Doesn't work for CYK")
def test_prioritization(self):

