
Merge branch 'grammar_builder' of https://github.com/MegaIng/lark into MegaIng-grammar_builder

Erez Sh · 3 years ago · commit 2f2cebedf8
10 changed files with 490 additions and 320 deletions:

1. docs/grammar.md (+20, -1)
2. examples/advanced/grammar_building.py (+59, -0)
3. lark-stubs/__init__.pyi (+1, -0)
4. lark-stubs/grammar.pyi (+9, -0)
5. lark-stubs/lark.pyi (+3, -1)
6. lark-stubs/load_grammar.pyi (+28, -0)
7. lark/grammars/common.lark (+1, -1)
8. lark/lark.py (+44, -39)
9. lark/load_grammar.py (+290, -273)
10. tests/test_grammar.py (+35, -5)

docs/grammar.md (+20, -1)

@@ -291,7 +291,7 @@ Declare a terminal without defining it. Useful for plugins.


### %override


Override a rule, affecting all the rules that refer to it.
Override a rule or terminal, affecting all references to it, even in imported grammars.


Useful for implementing an inheritance pattern when importing grammars.


@@ -302,3 +302,22 @@ Useful for implementing an inheritance pattern when importing grammars.
// Add hex support to my_grammar
%override number: NUMBER | /0x\w+/
```

### %extend

Extend the definition of a rule or terminal, e.g. by adding a new option to what it can match, as if it were separated with `|`.

Useful for splitting the definition of a complex rule with many different options across multiple files.

Can also be used to implement a plugin system where a core grammar is extended by others.


**Example:**
```perl
%import my_grammar (start, NUMBER)

// Add hex support to my_grammar
%extend NUMBER: /0x\w+/
```

For both `%extend` and `%override`, there is no requirement for the rule or terminal to come from another file, but that is probably the most common use case.
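For illustration (not part of this diff), a minimal Python sketch of `%extend` applied to a rule defined in the same grammar string, where no `%import` is involved:

```python
from lark import Lark

# Minimal sketch: %extend targets a rule defined earlier in the same grammar text.
parser = Lark(r"""
    start: greeting
    greeting: "hello"

    // add another alternative to an existing rule
    %extend greeting: "hi"
""")

parser.parse("hello")   # matches as before
parser.parse("hi")      # matches thanks to %extend
```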

examples/advanced/grammar_building.py (+59, -0)

@@ -0,0 +1,59 @@
from pathlib import Path

from lark.indenter import Indenter
from lark.lark import Lark
from lark.load_grammar import GrammarBuilder

MATCH_GRAMMAR = ('match', """

%extend compound_stmt: match_stmt

match_stmt: "match" test ":" cases

cases: _NEWLINE _INDENT case+ _DEDENT

case: "case" test ":" suite // test is not quite correct.

""", ('compound_stmt', 'test', 'suite', '_DEDENT', '_INDENT', '_NEWLINE'))

EXTENSIONS = (MATCH_GRAMMAR,)

builder = GrammarBuilder()

builder.load_grammar((Path(__file__).with_name('python3.lark')).read_text(), 'python3')

for name, ext_grammar, needed_names in EXTENSIONS:
    mangle = builder.get_mangle(name, dict(zip(needed_names, needed_names)))
    builder.load_grammar(ext_grammar, name, mangle)

grammar = builder.build()


class PythonIndenter(Indenter):
    NL_type = '_NEWLINE'
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8


parser = Lark(grammar, parser='lalr', start=['single_input', 'file_input', 'eval_input'], postlex=PythonIndenter())

tree = parser.parse(r"""
a = 5

def name(n):
    match n:
        case 1:
            print("one")
        case 2:
            print("two")
        case _:
            print("number is too big")

name(a)
""", start='file_input')

print(tree.pretty())
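To make the `get_mangle` call above more concrete, here is a rough sketch of what the returned callable does, based on the `get_mangle` implementation later in this commit (the names below are hypothetical):

```python
from lark.load_grammar import GrammarBuilder

builder = GrammarBuilder()
mangle = builder.get_mangle('match', {'test': 'test', '_DEDENT': '_DEDENT'})

assert mangle('case') == 'match__case'          # private to the extension grammar
assert mangle('_helper') == '_match__helper'    # leading underscore is preserved
assert mangle('test') == 'test'                 # aliased names resolve to the host grammar
assert mangle('_DEDENT') == '_DEDENT'           # same for aliased terminals
```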

lark-stubs/__init__.pyi (+1, -0)

@@ -4,6 +4,7 @@ from .tree import *
from .visitors import *
from .exceptions import *
from .lexer import *
from .load_grammar import *
from .lark import *
from logging import Logger as _Logger from logging import Logger as _Logger




lark-stubs/grammar.pyi (+9, -0)

@@ -0,0 +1,9 @@
from typing import Optional, Tuple


class RuleOptions:
    keep_all_tokens: bool
    expand1: bool
    priority: int
    template_source: Optional[str]
    empty_indices: Tuple[bool, ...]
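A quick, hypothetical peek at the structures these stubs describe (sketch only; the `?` prefix on `start` is what sets `expand1`):

```python
from lark.load_grammar import GrammarBuilder

builder = GrammarBuilder()
builder.load_grammar('?start: "a"+\n')
grammar = builder.build()

# rule_defs entries are (name, params, tree, RuleOptions) tuples
(name, params, tree, options), = grammar.rule_defs
print(name, params, options.expand1, options.priority)   # start [] True None
```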

lark-stubs/lark.pyi (+3, -1)

@@ -8,6 +8,7 @@ from .visitors import Transformer
from .lexer import Token, Lexer, TerminalDef
from .tree import Tree
from .exceptions import UnexpectedInput
from .load_grammar import Grammar


_T = TypeVar('_T')


@@ -54,13 +55,14 @@ class FromPackageLoader:
class Lark:
    source_path: str
    source_grammar: str
    grammar: Grammar
    options: LarkOptions
    lexer: Lexer
    terminals: List[TerminalDef]


    def __init__(
        self,
        grammar: Union[str, IO[str]],
        grammar: Union[Grammar, str, IO[str]],
        *,
        start: Union[None, str, List[str]] = "start",
        parser: Literal["earley", "lalr", "cyk"] = "auto",


lark-stubs/load_grammar.pyi (+28, -0)

@@ -0,0 +1,28 @@
from typing import List, Tuple, Union, Callable, Dict, Optional

from lark import Tree
from lark.grammar import RuleOptions


class Grammar:
    rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]]
    term_defs: List[Tuple[str, Tuple[Tree, int]]]
    ignore: List[str]


class GrammarBuilder:
    global_keep_all_tokens: bool
    import_paths: List[Union[str, Callable]]

    def __init__(self, global_keep_all_tokens=..., import_paths=...): ...

    def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None): ...

    def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str],
                  base_mangle: Callable[[str], str] = None): ...

    def get_mangle(self, prefix: str, aliases: Dict[str, str], base_mangle: Callable[[str], str] = None): ...

    def check(self): ...

    def build(self) -> Grammar: ...
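A minimal sketch of how this API fits together, combining two made-up grammar fragments into a single `Grammar` (illustrative only):

```python
from lark.load_grammar import GrammarBuilder

core = '''
start: item+
item: NUMBER
NUMBER: /[0-9]+/
%ignore " "
'''

plugin = '''
%extend item: WORD
WORD: /[a-z]+/
'''

builder = GrammarBuilder()
builder.load_grammar(core, 'core')
builder.load_grammar(plugin, 'plugin')
grammar = builder.build()   # -> Grammar, ready to be passed to Lark()
```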

lark/grammars/common.lark (+1, -1)

@@ -55,5 +55,5 @@ NEWLINE: (CR? LF)+
// Comments
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /.*?/s "*/"
C_COMMENT: "/*" /(.|\n)*?/ "*/"
SQL_COMMENT: /--[^\n]*/
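Both forms of C_COMMENT match comments that span newlines; a quick sanity check with Python's `re` module directly (illustrative only, outside the lark grammar DSL):

```python
import re

comment = "/* line one\n   line two */"

old = re.match(r"/\*.*?\*/", comment, re.S)   # '.' crosses newlines via the DOTALL flag
new = re.match(r"/\*(.|\n)*?\*/", comment)    # '(.|\n)' crosses newlines without a flag

assert old.group(0) == new.group(0) == comment
```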

lark/lark.py (+44, -39)

@@ -7,7 +7,7 @@ import tempfile
from warnings import warn

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
from .load_grammar import load_grammar, FromPackageLoader
from .load_grammar import load_grammar, FromPackageLoader, Grammar
from .tree import Tree
from .common import LexerConf, ParserConf


@@ -234,42 +234,50 @@ class Lark(Serialize):
else:
grammar = read()


assert isinstance(grammar, STRING_TYPE)
self.source_grammar = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise ConfigurationError("`use_bytes=True` may have issues on python2."
"Use `use_bytes='force'` to use it at your own risk.")

cache_fn = None
if self.options.cache:
if self.options.parser != 'lalr':
raise ConfigurationError("cache only works with parser='lalr' for now")
if isinstance(self.options.cache, STRING_TYPE):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ConfigurationError("cache argument must be bool or str")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
from . import __version__
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
s = grammar + options_str + __version__
md5 = hashlib.md5(s.encode()).hexdigest()
cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5

if FS.exists(cache_fn):
logger.debug('Loading grammar from cache: %s', cache_fn)
# Remove options that aren't relevant for loading from cache
for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
del options[name]
with FS.open(cache_fn, 'rb') as f:
try:
self._load(f, **options)
except Exception:
raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
return
if isinstance(grammar, STRING_TYPE):
self.source_grammar = grammar
if self.options.use_bytes:
if not isascii(grammar):
raise ConfigurationError("Grammar must be ascii only, when use_bytes=True")
if sys.version_info[0] == 2 and self.options.use_bytes != 'force':
raise ConfigurationError("`use_bytes=True` may have issues on python2."
"Use `use_bytes='force'` to use it at your own risk.")
if self.options.cache:
if self.options.parser != 'lalr':
raise ConfigurationError("cache only works with parser='lalr' for now")
if isinstance(self.options.cache, STRING_TYPE):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ConfigurationError("cache argument must be bool or str")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
from . import __version__
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
s = grammar + options_str + __version__
md5 = hashlib.md5(s.encode()).hexdigest()
cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5
if FS.exists(cache_fn):
logger.debug('Loading grammar from cache: %s', cache_fn)
# Remove options that aren't relevant for loading from cache
for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
del options[name]
with FS.open(cache_fn, 'rb') as f:
try:
self._load(f, **options)
except Exception:
raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
return


# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
else:
assert isinstance(grammar, Grammar)
self.grammar = grammar


if self.options.lexer == 'auto':
if self.options.parser == 'lalr':
@@ -301,9 +309,6 @@ class Lark(Serialize):
if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))


# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)

if self.options.postlex is not None:
terminals_to_keep = set(self.options.postlex.always_accept)
else:
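With `Lark()` now accepting `Union[Grammar, str, IO[str]]`, a grammar can be composed programmatically and handed to the constructor without going through text again. A rough sketch (names are illustrative):

```python
from lark import Lark
from lark.load_grammar import GrammarBuilder

builder = GrammarBuilder()
builder.load_grammar(r"""
start: WORD+
%import common.WORD
%ignore " "
""")
grammar = builder.build()

parser = Lark(grammar, parser='lalr')   # a prebuilt Grammar instead of grammar text
print(parser.parse("hello world").pretty())
```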


lark/load_grammar.py (+290, -273)

@@ -6,6 +6,7 @@ from copy import copy, deepcopy
from io import open
import pkgutil
from ast import literal_eval
from numbers import Integral

from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -95,6 +96,7 @@ TERMINALS = {
'_IGNORE': r'%ignore',
'_OVERRIDE': r'%override',
'_DECLARE': r'%declare',
'_EXTEND': r'%extend',
'_IMPORT': r'%import',
'NUMBER': r'[+-]?\d+',
}
@@ -149,8 +151,11 @@ RULES = {


'term': ['TERMINAL _COLON expansions _NL',
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
'statement': ['ignore', 'import', 'declare', 'override_rule'],
'override_rule': ['_OVERRIDE rule'],
'statement': ['ignore', 'import', 'declare', 'override', 'extend'],
'override': ['_OVERRIDE rule',
'_OVERRIDE term'],
'extend': ['_EXTEND rule',
'_EXTEND term'],
'ignore': ['_IGNORE expansions _NL'],
'declare': ['_DECLARE _declare_args _NL'],
'import': ['_IMPORT _import_path _NL',
@@ -298,15 +303,6 @@ class RuleTreeToText(Transformer):
return expansion, alias.value




@inline_args
class CanonizeTree(Transformer_InPlace):
def tokenmods(self, *args):
if len(args) == 1:
return list(args)
tokenmods, value = args
return tokenmods + [value]


class PrepareAnonTerminals(Transformer_InPlace):
"""Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"""


@@ -546,10 +542,6 @@ class PrepareSymbols(Transformer_InPlace):
assert False




def _choice_of_rules(rules):
return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])


def nr_deepcopy_tree(t):
"""Deepcopy tree `t` without recursion"""
return Transformer_NonRecursive(False).transform(t)
@@ -736,58 +728,6 @@ class FromPackageLoader(object):


stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS)


_imported_grammars = {}


def import_from_grammar_into_namespace(grammar, namespace, aliases):
"""Returns all rules and terminals of grammar, prepended
with a 'namespace' prefix, except for those which are aliased.
"""

imported_terms = dict(grammar.term_defs)
imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs}

term_defs = []
rule_defs = []

def rule_dependencies(symbol):
if symbol.type != 'RULE':
return []
try:
_, params, tree,_ = imported_rules[symbol]
except KeyError:
raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
return _find_used_symbols(tree) - set(params)

def get_namespace_name(name, params):
if params is not None:
try:
return params[name]
except KeyError:
pass
try:
return aliases[name].value
except KeyError:
if name[0] == '_':
return '_%s__%s' % (namespace, name[1:])
return '%s__%s' % (namespace, name)

to_import = list(bfs(aliases, rule_dependencies))
for symbol in to_import:
if symbol.type == 'TERMINAL':
term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]])
else:
assert symbol.type == 'RULE'
_, params, tree, options = imported_rules[symbol]
params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s') % (namespace, p) for p in params}
for t in tree.iter_subtrees():
for i, c in enumerate(t.children):
if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
t.children[i] = Token(c.type, get_namespace_name(c, params_map))
params = [params_map[p] for p in params] # We can not rely on ordered dictionaries
rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options))

return term_defs, rule_defs




def resolve_term_references(term_defs):
@@ -859,8 +799,25 @@ def _find_used_symbols(tree):
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}




class GrammarLoader:
ERRORS = [
def _grammar_parser():
try:
return _grammar_parser.cache
except AttributeError:
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, ['start'])
lexer_conf.lexer_type = 'standard'
parser_conf.parser_type = 'lalr'
_grammar_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {})
return _grammar_parser.cache

GRAMMAR_ERRORS = [
('Unclosed parenthesis', ['a: (\n']),
('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
@@ -874,231 +831,291 @@ class GrammarLoader:
('%ignore expects a value', ['%ignore %import\n']),
]


def __init__(self, global_keep_all_tokens):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o)
for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
import re
lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, ['start'])
lexer_conf.lexer_type = 'standard'
parser_conf.parser_type = 'lalr'
self.parser = ParsingFrontend(lexer_conf, parser_conf, {})

self.canonize_tree = CanonizeTree()
def _parse_grammar(text, name, start='start'):
try:
return PrepareGrammar().transform(_grammar_parser().parse(text + '\n', start))
except UnexpectedCharacters as e:
context = e.get_context(text)
raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
(e.line, e.column, name, context))
except UnexpectedToken as e:
context = e.get_context(text)
error = e.match_examples(_grammar_parser().parse, GRAMMAR_ERRORS, use_accepts=True)
if error:
raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
elif 'STRING' in e.expected:
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
raise


class GrammarBuilder:
def __init__(self, global_keep_all_tokens=False, import_paths=None):
self.global_keep_all_tokens = global_keep_all_tokens

def import_grammar(self, grammar_path, base_path=None, import_paths=[]):
if grammar_path not in _imported_grammars:
# import_paths take priority over base_path since they should handle relative imports and ignore everything else.
to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
self.import_paths = import_paths or []

self._definitions = {}
self._ignore_names = []
def _is_term(self, name):
# Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME`
# Only the last part is the actual name, and the rest might contain mixed case
return name.rpartition('__')[-1].isupper()

def _grammar_error(self, msg, *names):
args = {}
for i, name in enumerate(names, start=1):
postfix = '' if i == 1 else str(i)
args['name' + postfix] = name
args['type' + postfix] = lowercase_type = ("rule", "terminal")[self._is_term(name)]
args['Type' + postfix] = lowercase_type.title()
raise GrammarError(msg.format(**args))
def _check_options(self, name, options):
if self._is_term(name):
if options is None:
options = 1
# if we don't use Integral here, we run into python2.7/python3 problems with long vs int
elif not isinstance(options, Integral):
raise GrammarError("Terminal require a single int as 'options' (e.g. priority), got %s" % (type(options),))
else:
if options is None:
options = RuleOptions()
elif not isinstance(options, RuleOptions):
raise GrammarError("Rules require a RuleOptions instance as 'options'")
if self.global_keep_all_tokens:
options.keep_all_tokens = True
return options

def _define(self, name, exp, params=(), options=None, override=False):
if (name in self._definitions) ^ override:
if override:
self._grammar_error("Cannot override a nonexisting {type} {name}", name)
else:
self._grammar_error("{Type} '{name}' defined more than once", name)
if name.startswith('__'):
self._grammar_error('Names starting with double-underscore are reserved (Error at {name})', name)
self._definitions[name] = (params, exp, self._check_options(name, options))

def _extend(self, name, exp, params=(), options=None):
if name not in self._definitions:
self._grammar_error("Can't extend {type} {name} as it wasn't defined before", name)
if tuple(params) != tuple(self._definitions[name][0]):
self._grammar_error("Cannot extend {type} with different parameters: {name}", name)
# TODO: think about what to do with 'options'
base = self._definitions[name][1]

while len(base.children) == 2:
assert isinstance(base.children[0], Tree) and base.children[0].data == 'expansions', base
base = base.children[0]
base.children.insert(0, exp)

def _ignore(self, exp_or_name):
if isinstance(exp_or_name, str):
self._ignore_names.append(exp_or_name)
else:
assert isinstance(exp_or_name, Tree)
t = exp_or_name
if t.data == 'expansions' and len(t.children) == 1:
t2 ,= t.children
if t2.data=='expansion' and len(t2.children) == 1:
item ,= t2.children
if item.data == 'value':
item ,= item.children
if isinstance(item, Token) and item.type == 'TERMINAL':
self._ignore_names.append(item.value)
return

name = '__IGNORE_%d'% len(self._ignore_names)
self._ignore_names.append(name)
self._definitions[name] = ((), t, 1)
def _declare(self, *names):
for name in names:
self._define(name, None)
def _mangle_exp(self, exp, mangle):
if mangle is None:
return exp
exp = deepcopy(exp) # TODO: is this needed
for t in exp.iter_subtrees():
for i, c in enumerate(t.children):
if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
t.children[i] = Token(c.type, mangle(c.value))
return exp

def _unpack_definition(self, tree, mangle):
if tree.data == 'rule':
name, params, exp, opts = options_from_rule(*tree.children)
else:
name = tree.children[0].value
params = ()
opts = int(tree.children[1]) if len(tree.children) == 3 else 1 # priority
exp = tree.children[-1]
if mangle is not None:
params = tuple(mangle(p) for p in params)
name = mangle(name)
exp = self._mangle_exp(exp, mangle)
return name, exp, params, opts
def _unpack_import(self, stmt, grammar_name):
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
else:
path_node, = stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import
dotted_path = tuple(path_node.children)
names = arg1.children
aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names
else: # Single import
dotted_path = tuple(path_node.children[:-1])
name = path_node.children[-1] # Get name from dotted path
aliases = {name.value: (arg1 or name).value} # Aliases if exist

if path_node.data == 'import_lib': # Import from library
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
else:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
except IOError:
continue
else:
grammar = self.load_grammar(text, joined_path, import_paths)
_imported_grammars[grammar_path] = grammar
break
base_file = os.path.abspath(sys.modules['__main__'].__file__)
except AttributeError:
base_file = None
else:
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False

return _imported_grammars[grammar_path]

def load_grammar(self, grammar_text, grammar_name='<?>', import_paths=[]):
"""Parse grammar_text, verify, and create Grammar object. Display nice messages on error."""

try:
tree = self.canonize_tree.transform(self.parser.parse(grammar_text+'\n'))
except UnexpectedCharacters as e:
context = e.get_context(grammar_text)
raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
(e.line, e.column, grammar_name, context))
except UnexpectedToken as e:
context = e.get_context(grammar_text)
error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True)
if error:
raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
elif 'STRING' in e.expected:
raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
raise

tree = PrepareGrammar().transform(tree)

# Extract grammar items
defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
term_defs = defs.pop('term', [])
rule_defs = defs.pop('rule', [])
statements = defs.pop('statement', [])
assert not defs

term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
rule_defs = [options_from_rule(*x) for x in rule_defs]

# Execute statements
ignore, imports = [], {}
overriding_rules = []
for (stmt,) in statements:
if stmt.data == 'ignore':
t ,= stmt.children
ignore.append(t)
elif stmt.data == 'import':
if len(stmt.children) > 1:
path_node, arg1 = stmt.children
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
else:
path_node ,= stmt.children
arg1 = None

if isinstance(arg1, Tree): # Multi import
dotted_path = tuple(path_node.children)
names = arg1.children
aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names
else: # Single import
dotted_path = tuple(path_node.children[:-1])
name = path_node.children[-1] # Get name from dotted path
aliases = {name: arg1 or name} # Aliases if exist

if path_node.data == 'import_lib': # Import from library
base_path = None
else: # Relative import
if grammar_name == '<string>': # Import relative to script file path if grammar is coded in script
try:
base_file = os.path.abspath(sys.modules['__main__'].__file__)
except AttributeError:
base_file = None
else:
base_file = grammar_name # Import relative to grammar file path if external grammar file
if base_file:
if isinstance(base_file, PackageResource):
base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
else:
base_path = os.path.split(base_file)[0]
else:
base_path = os.path.abspath(os.path.curdir)

base_path = os.path.split(base_file)[0]
else:
base_path = os.path.abspath(os.path.curdir)
return dotted_path, base_path, aliases

def load_grammar(self, grammar_text, grammar_name="<?>", mangle=None):
tree = _parse_grammar(grammar_text, grammar_name)
imports = {} # imports are collect over the whole file to prevent duplications
actions = [] # Some statements need to be delayed (override and extend) till after imports are handled
for stmt in tree.children:
if stmt.data in ('term', 'rule'):
self._define(*self._unpack_definition(stmt, mangle))
continue
assert stmt.data == 'statement', stmt.data
stmt ,= stmt.children
if stmt.data == 'import':
dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name)
try:
import_base_path, import_aliases = imports[dotted_path]
assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
import_aliases.update(aliases)
except KeyError:
imports[dotted_path] = base_path, aliases

elif stmt.data == 'ignore':
# if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar
if mangle is None:
self._ignore(*stmt.children)
elif stmt.data == 'declare':
for t in stmt.children:
term_defs.append([t.value, (None, None)])
elif stmt.data == 'override_rule':
if mangle is None:
self._declare(*(t.value for t in stmt.children))
else:
self._declare(*(mangle(t.value) for t in stmt.children))
elif stmt.data == 'override':
r ,= stmt.children
actions.append((self._define, self._unpack_definition(r, mangle) + (True,)))
elif stmt.data == 'extend':
r ,= stmt.children
overriding_rules.append(options_from_rule(*r.children))
actions.append((self._extend, self._unpack_definition(r, mangle)))
else:
assert False, stmt

# import grammars
for dotted_path, (base_path, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td
rule_defs += new_rd

# replace rules by overridding rules, according to name
for r in overriding_rules:
name = r[0]
# remove overridden rule from rule_defs
overridden, rule_defs = classify_bool(rule_defs, lambda r: r[0] == name) # FIXME inefficient
if not overridden:
raise GrammarError("Cannot override a nonexisting rule: %s" % name)
rule_defs.append(r)

## Handle terminals

# Verify correctness 1
for name, _ in term_defs:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)

# Handle ignore tokens
# XXX A slightly hacky solution. Recognition of %ignore TERMINAL as separate comes from the lexer's
# inability to handle duplicate terminals (two names, one value)
ignore_names = []
for t in ignore:
if t.data=='expansions' and len(t.children) == 1:
t2 ,= t.children
if t2.data=='expansion' and len(t2.children) == 1:
item ,= t2.children
if item.data == 'value':
item ,= item.children
if isinstance(item, Token) and item.type == 'TERMINAL':
ignore_names.append(item.value)
continue

name = '__IGNORE_%d'% len(ignore_names)
ignore_names.append(name)
term_defs.append((name, (t, 1)))

# Verify correctness 2
terminal_names = set()
for name, _ in term_defs:
if name in terminal_names:
raise GrammarError("Terminal '%s' defined more than once" % name)
terminal_names.add(name)

if set(ignore_names) > terminal_names:
raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names))

resolve_term_references(term_defs)

## Handle rules

rule_names = {}
for name, params, _x, option in rule_defs:
# We can't just simply not throw away the tokens later, we need option.keep_all_tokens to correctly generate maybe_placeholders
if self.global_keep_all_tokens:
option.keep_all_tokens = True

if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if name in rule_names:
raise GrammarError("Rule '%s' defined more than once" % name)
rule_names[name] = len(params)

for name, params , expansions, _o in rule_defs:
self.do_import(dotted_path, base_path, aliases, mangle)
for f, args in actions:
f(*args)
def do_import(self, dotted_path, base_path, aliases, base_mangle=None):
mangle = self.get_mangle('__'.join(dotted_path), aliases, base_mangle)
grammar_path = os.path.join(*dotted_path) + EXT
to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
for source in to_try:
try:
if callable(source):
joined_path, text = source(base_path, grammar_path)
else:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
except IOError:
continue
else:
self.load_grammar(text, joined_path, mangle)
break
else:
# Search failed. Make Python throw a nice error.
open(grammar_path, encoding='utf8')
assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,)

def get_mangle(self, prefix, aliases, base_mangle=None):
def mangle(s):
if s in aliases:
s = aliases[s]
else:
if s[0] == '_':
s = '_%s__%s' % (prefix, s[1:])
else:
s = '%s__%s' % (prefix, s)
if base_mangle is not None:
s = base_mangle(s)
return s
return mangle

def check(self):
for name, (params, exp, options) in self._definitions.items():
for i, p in enumerate(params):
if p in rule_names:
if p in self._definitions:
raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name)) raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name))
if p in params[:i]: if p in params[:i]:
raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name)) raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name))
for temp in expansions.find_data('template_usage'):
if exp is None: # Remaining checks don't work for abstract rules/terminals
continue

for temp in exp.find_data('template_usage'):
sym = temp.children[0]
args = temp.children[1:]
if sym not in params:
if sym not in rule_names:
raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name))
if len(args) != rule_names[sym]:
raise GrammarError("Wrong number of template arguments used for %s "
"(expected %s, got %s) (in rule %s)" % (sym, rule_names[sym], len(args), name))
for sym in _find_used_symbols(expansions):
if sym.type == 'TERMINAL':
if sym not in terminal_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
else:
if sym not in rule_names and sym not in params:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

return Grammar(rule_defs, term_defs, ignore_names)

if sym not in self._definitions:
self._grammar_error("Template '%s' used but not defined (in {type} {name})" % sym, name)
if len(args) != len(self._definitions[sym][0]):
expected, actual = len(self._definitions[sym][0]), len(args)
self._grammar_error("Wrong number of template arguments used for {name} "
"(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name)
for sym in _find_used_symbols(exp):
if sym not in self._definitions and sym not in params:
self._grammar_error("{Type} '{name}' used but not defined (in {type2} {name2})", sym, name)

if not set(self._definitions).issuperset(self._ignore_names):
raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions)))

def build(self):
self.check()
rule_defs = []
term_defs = []
for name, (params, exp, options) in self._definitions.items():
if self._is_term(name):
assert len(params) == 0
term_defs.append((name, (exp, options)))
else:
rule_defs.append((name, params, exp, options))
resolve_term_references(term_defs)
return Grammar(rule_defs, term_defs, self._ignore_names)


def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
builder = GrammarBuilder(global_keep_all_tokens, import_paths)
builder.load_grammar(grammar, source)
return builder.build()
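As a small usage sketch of the new error handling (hypothetical snippet, not part of the diff): extending a name that was never defined is rejected with one of the `_grammar_error` messages above:

```python
from lark.exceptions import GrammarError
from lark.load_grammar import GrammarBuilder

builder = GrammarBuilder()
builder.load_grammar('start: "a"\n')
try:
    builder.load_grammar('%extend FOO: "b"\n')   # FOO was never defined
except GrammarError as e:
    print(e)   # e.g. "Can't extend terminal FOO as it wasn't defined before"
```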

tests/test_grammar.py (+35, -5)

@@ -3,8 +3,8 @@ from __future__ import absolute_import
import sys
from unittest import TestCase, main


from lark import Lark
from lark.load_grammar import GrammarLoader, GrammarError
from lark import Lark, Token, Tree
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS




class TestGrammar(TestCase):
@@ -12,7 +12,7 @@ class TestGrammar(TestCase):
pass


def test_errors(self):
for msg, examples in GrammarLoader.ERRORS:
for msg, examples in GRAMMAR_ERRORS:
for example in examples:
try:
p = Lark(example)
@@ -21,7 +21,7 @@ class TestGrammar(TestCase):
else:
assert False, "example did not raise an error"


def test_override(self):
def test_override_rule(self):
# Overrides the 'sep' template in existing grammar to add an optional terminating delimiter
# Thus extending it beyond its original capacity
p = Lark("""
@@ -29,12 +29,42 @@ class TestGrammar(TestCase):


%override sep{item, delim}: item (delim item)* delim?
%ignore " "
""")
""", source_path=__file__)


a = p.parse('[1, 2, 3]')
b = p.parse('[1, 2, 3, ]')
assert a == b


def test_override_terminal(self):
p = Lark("""
%import .grammars.ab (startab, A, B)
%override A: "c"
%override B: "d"
""", start='startab', source_path=__file__)

a = p.parse('cd')
self.assertEqual(a.children[0].children, [Token('A', 'c'), Token('B', 'd')])

def test_extend_rule(self):
p = Lark("""
%import .grammars.ab (startab, A, B, expr)

%extend expr: B A
""", start='startab', source_path=__file__)
a = p.parse('abab')
self.assertEqual(a.children[0].children, ['a', Tree('expr', ['b', 'a']), 'b'])

def test_extend_term(self):
p = Lark("""
%import .grammars.ab (startab, A, B, expr)
%extend A: "c"
""", start='startab', source_path=__file__)
a = p.parse('acbb')
self.assertEqual(a.children[0].children, ['a', Tree('expr', ['c', 'b']), 'b'])





if __name__ == '__main__':

