diff --git a/docs/grammar.md b/docs/grammar.md index b899b3f..0d77420 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -291,7 +291,7 @@ Declare a terminal without defining it. Useful for plugins. ### %override -Override a rule, affecting all the rules that refer to it. +Override a rule or terminal, affecting all references to it, even in imported grammars. Useful for implementing an inheritance pattern when importing grammars. @@ -302,3 +302,22 @@ Useful for implementing an inheritance pattern when importing grammars. // Add hex support to my_grammar %override number: NUMBER | /0x\w+/ ``` + +### %extend + +Extend the definition of a rule or terminal, i.e. add a new option to what it can match, as if it were appended with `|`. + +Useful for splitting up the definition of a complex rule with many different options over multiple files. + +Can also be used to implement a plugin system where a core grammar is extended by others. + + +**Example:** +```perl +%import my_grammar (start, NUMBER) + +// Add hex support to my_grammar +%extend NUMBER: /0x\w+/ +``` + +For both `%extend` and `%override`, there is no requirement for the rule/terminal to come from another file, but that is probably the most common use case. \ No newline at end of file diff --git a/examples/advanced/grammar_building.py b/examples/advanced/grammar_building.py new file mode 100644 index 0000000..0967045 --- /dev/null +++ b/examples/advanced/grammar_building.py @@ -0,0 +1,59 @@ +from pathlib import Path + +from lark.indenter import Indenter +from lark.lark import Lark +from lark.load_grammar import GrammarBuilder + +MATCH_GRAMMAR = ('match', """ + +%extend compound_stmt: match_stmt + +match_stmt: "match" test ":" cases + +cases: _NEWLINE _INDENT case+ _DEDENT + +case: "case" test ":" suite // test is not quite correct. + +""", ('compound_stmt', 'test', 'suite', '_DEDENT', '_INDENT', '_NEWLINE')) + +EXTENSIONS = (MATCH_GRAMMAR,) + +builder = GrammarBuilder() + +builder.load_grammar((Path(__file__).with_name('python3.lark')).read_text(), 'python3') + +for name, ext_grammar, needed_names in EXTENSIONS: + mangle = builder.get_mangle(name, dict(zip(needed_names, needed_names))) + builder.load_grammar(ext_grammar, name, mangle) + +grammar = builder.build() + + +class PythonIndenter(Indenter): + NL_type = '_NEWLINE' + OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE'] + CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE'] + INDENT_type = '_INDENT' + DEDENT_type = '_DEDENT' + tab_len = 8 + + +parser = Lark(grammar, parser='lalr', start=['single_input', 'file_input', 'eval_input'], postlex=PythonIndenter()) + +tree = parser.parse(r""" + +a = 5 + +def name(n): + match n: + case 1: + print("one") + case 2: + print("two") + case _: + print("number is too big") + +name(a) +""", start='file_input') + +print(tree.pretty()) diff --git a/lark-stubs/__init__.pyi b/lark-stubs/__init__.pyi index c010a93..c79a6ef 100644 --- a/lark-stubs/__init__.pyi +++ b/lark-stubs/__init__.pyi @@ -4,6 +4,7 @@ from .tree import * from .visitors import * from .exceptions import * from .lexer import * +from .load_grammar import * from .lark import * from logging import Logger as _Logger diff --git a/lark-stubs/grammar.pyi b/lark-stubs/grammar.pyi new file mode 100644 index 0000000..379d7a9 --- /dev/null +++ b/lark-stubs/grammar.pyi @@ -0,0 +1,9 @@ +from typing import Optional, Tuple + + +class RuleOptions: + keep_all_tokens: bool + expand1: bool + priority: int + template_source: Optional[str] + empty_indices: Tuple[bool, ...]
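A small aside on the `%extend`/`%override` documentation above: as the last paragraph of that hunk notes, the extended rule or terminal does not have to come from an `%import`; it can be defined a few lines earlier in the same grammar. A minimal sketch (the grammar below is invented for illustration and is not part of this PR; it only uses `%extend` as documented plus terminals from the bundled `common.lark`):

```python
from lark import Lark

# Hypothetical single-file grammar: `%extend` adds an alternative to a rule
# defined earlier in the same grammar text -- no second grammar file involved.
grammar = r"""
start: value+
value: NUMBER

%extend value: STRING

%import common.NUMBER
%import common.ESCAPED_STRING -> STRING
%import common.WS
%ignore WS
"""

parser = Lark(grammar, parser='lalr')
print(parser.parse('1 "two" 3').pretty())
```

The same in-file pattern works with `%override` when the goal is to replace the earlier definition rather than add an alternative to it.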
\ No newline at end of file diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index ecbbb09..9246938 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -8,6 +8,7 @@ from .visitors import Transformer from .lexer import Token, Lexer, TerminalDef from .tree import Tree from .exceptions import UnexpectedInput +from .load_grammar import Grammar _T = TypeVar('_T') @@ -54,13 +55,14 @@ class FromPackageLoader: class Lark: source_path: str source_grammar: str + grammar: Grammar options: LarkOptions lexer: Lexer terminals: List[TerminalDef] def __init__( self, - grammar: Union[str, IO[str]], + grammar: Union[Grammar, str, IO[str]], *, start: Union[None, str, List[str]] = "start", parser: Literal["earley", "lalr", "cyk"] = "auto", diff --git a/lark-stubs/load_grammar.pyi b/lark-stubs/load_grammar.pyi new file mode 100644 index 0000000..cadd657 --- /dev/null +++ b/lark-stubs/load_grammar.pyi @@ -0,0 +1,28 @@ +from typing import List, Tuple, Union, Callable, Dict, Optional + +from lark import Tree +from lark.grammar import RuleOptions + + +class Grammar: + rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]] + term_defs: List[Tuple[str, Tuple[Tree, int]]] + ignore: List[str] + + +class GrammarBuilder: + global_keep_all_tokens: bool + import_paths: List[Union[str, Callable]] + + def __init__(self, global_keep_all_tokens=..., import_paths=...): ... + + def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None): ... + + def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str], + base_mangle: Callable[[str], str] = None): ... + + def get_mangle(self, prefix: str, aliases: Dict[str, str], base_mangle: Callable[[str], str] = None): ... + + def check(self): ... + + def build(self) -> Grammar: ... diff --git a/lark/grammars/common.lark b/lark/grammars/common.lark index 1158026..d2e86d1 100644 --- a/lark/grammars/common.lark +++ b/lark/grammars/common.lark @@ -55,5 +55,5 @@ NEWLINE: (CR? LF)+ // Comments SH_COMMENT: /#[^\n]*/ CPP_COMMENT: /\/\/[^\n]*/ -C_COMMENT: "/*" /.*?/s "*/" +C_COMMENT: "/*" /(.|\n)*?/ "*/" SQL_COMMENT: /--[^\n]*/ diff --git a/lark/lark.py b/lark/lark.py index 3e0a51f..b1b9270 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -7,7 +7,7 @@ import tempfile from warnings import warn from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger -from .load_grammar import load_grammar, FromPackageLoader +from .load_grammar import load_grammar, FromPackageLoader, Grammar from .tree import Tree from .common import LexerConf, ParserConf @@ -234,42 +234,50 @@ class Lark(Serialize): else: grammar = read() - assert isinstance(grammar, STRING_TYPE) - self.source_grammar = grammar - if self.options.use_bytes: - if not isascii(grammar): - raise ConfigurationError("Grammar must be ascii only, when use_bytes=True") - if sys.version_info[0] == 2 and self.options.use_bytes != 'force': - raise ConfigurationError("`use_bytes=True` may have issues on python2." - "Use `use_bytes='force'` to use it at your own risk.") - cache_fn = None - if self.options.cache: - if self.options.parser != 'lalr': - raise ConfigurationError("cache only works with parser='lalr' for now") - if isinstance(self.options.cache, STRING_TYPE): - cache_fn = self.options.cache - else: - if self.options.cache is not True: - raise ConfigurationError("cache argument must be bool or str") - unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') - from . 
import __version__ - options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) - s = grammar + options_str + __version__ - md5 = hashlib.md5(s.encode()).hexdigest() - cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5 - - if FS.exists(cache_fn): - logger.debug('Loading grammar from cache: %s', cache_fn) - # Remove options that aren't relevant for loading from cache - for name in (set(options) - _LOAD_ALLOWED_OPTIONS): - del options[name] - with FS.open(cache_fn, 'rb') as f: - try: - self._load(f, **options) - except Exception: - raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn) - return + if isinstance(grammar, STRING_TYPE): + self.source_grammar = grammar + if self.options.use_bytes: + if not isascii(grammar): + raise ConfigurationError("Grammar must be ascii only, when use_bytes=True") + if sys.version_info[0] == 2 and self.options.use_bytes != 'force': + raise ConfigurationError("`use_bytes=True` may have issues on python2." + "Use `use_bytes='force'` to use it at your own risk.") + + if self.options.cache: + if self.options.parser != 'lalr': + raise ConfigurationError("cache only works with parser='lalr' for now") + if isinstance(self.options.cache, STRING_TYPE): + cache_fn = self.options.cache + else: + if self.options.cache is not True: + raise ConfigurationError("cache argument must be bool or str") + unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') + from . import __version__ + options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) + s = grammar + options_str + __version__ + md5 = hashlib.md5(s.encode()).hexdigest() + cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % md5 + + if FS.exists(cache_fn): + logger.debug('Loading grammar from cache: %s', cache_fn) + # Remove options that aren't relevant for loading from cache + for name in (set(options) - _LOAD_ALLOWED_OPTIONS): + del options[name] + with FS.open(cache_fn, 'rb') as f: + try: + self._load(f, **options) + except Exception: + raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn) + return + + + # Parse the grammar file and compose the grammars + self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) + else: + assert isinstance(grammar, Grammar) + self.grammar = grammar + if self.options.lexer == 'auto': if self.options.parser == 'lalr': @@ -301,9 +309,6 @@ class Lark(Serialize): if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS: raise ConfigurationError("invalid ambiguity option: %r. 
Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS)) - # Parse the grammar file and compose the grammars - self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) - if self.options.postlex is not None: terminals_to_keep = set(self.options.postlex.always_accept) else: diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 0fafc1c..58cb7e3 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -6,6 +6,7 @@ from copy import copy, deepcopy from io import open import pkgutil from ast import literal_eval +from numbers import Integral from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start from .lexer import Token, TerminalDef, PatternStr, PatternRE @@ -95,6 +96,7 @@ TERMINALS = { '_IGNORE': r'%ignore', '_OVERRIDE': r'%override', '_DECLARE': r'%declare', + '_EXTEND': r'%extend', '_IMPORT': r'%import', 'NUMBER': r'[+-]?\d+', } @@ -149,8 +151,11 @@ RULES = { 'term': ['TERMINAL _COLON expansions _NL', 'TERMINAL _DOT NUMBER _COLON expansions _NL'], - 'statement': ['ignore', 'import', 'declare', 'override_rule'], - 'override_rule': ['_OVERRIDE rule'], + 'statement': ['ignore', 'import', 'declare', 'override', 'extend'], + 'override': ['_OVERRIDE rule', + '_OVERRIDE term'], + 'extend': ['_EXTEND rule', + '_EXTEND term'], 'ignore': ['_IGNORE expansions _NL'], 'declare': ['_DECLARE _declare_args _NL'], 'import': ['_IMPORT _import_path _NL', @@ -298,15 +303,6 @@ class RuleTreeToText(Transformer): return expansion, alias.value -@inline_args -class CanonizeTree(Transformer_InPlace): - def tokenmods(self, *args): - if len(args) == 1: - return list(args) - tokenmods, value = args - return tokenmods + [value] - - class PrepareAnonTerminals(Transformer_InPlace): """Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them""" @@ -546,10 +542,6 @@ class PrepareSymbols(Transformer_InPlace): assert False -def _choice_of_rules(rules): - return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules]) - - def nr_deepcopy_tree(t): """Deepcopy tree `t` without recursion""" return Transformer_NonRecursive(False).transform(t) @@ -736,58 +728,6 @@ class FromPackageLoader(object): stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) -_imported_grammars = {} - - -def import_from_grammar_into_namespace(grammar, namespace, aliases): - """Returns all rules and terminals of grammar, prepended - with a 'namespace' prefix, except for those which are aliased. 
- """ - - imported_terms = dict(grammar.term_defs) - imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs} - - term_defs = [] - rule_defs = [] - - def rule_dependencies(symbol): - if symbol.type != 'RULE': - return [] - try: - _, params, tree,_ = imported_rules[symbol] - except KeyError: - raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace)) - return _find_used_symbols(tree) - set(params) - - def get_namespace_name(name, params): - if params is not None: - try: - return params[name] - except KeyError: - pass - try: - return aliases[name].value - except KeyError: - if name[0] == '_': - return '_%s__%s' % (namespace, name[1:]) - return '%s__%s' % (namespace, name) - - to_import = list(bfs(aliases, rule_dependencies)) - for symbol in to_import: - if symbol.type == 'TERMINAL': - term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]]) - else: - assert symbol.type == 'RULE' - _, params, tree, options = imported_rules[symbol] - params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s') % (namespace, p) for p in params} - for t in tree.iter_subtrees(): - for i, c in enumerate(t.children): - if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'): - t.children[i] = Token(c.type, get_namespace_name(c, params_map)) - params = [params_map[p] for p in params] # We can not rely on ordered dictionaries - rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options)) - - return term_defs, rule_defs def resolve_term_references(term_defs): @@ -859,8 +799,25 @@ def _find_used_symbols(tree): for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} -class GrammarLoader: - ERRORS = [ +def _grammar_parser(): + try: + return _grammar_parser.cache + except AttributeError: + terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] + + rules = [options_from_rule(name, None, x) for name, x in RULES.items()] + rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) + for r, _p, xs, o in rules for i, x in enumerate(xs)] + callback = ParseTreeBuilder(rules, ST).create_callback() + import re + lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT']) + parser_conf = ParserConf(rules, callback, ['start']) + lexer_conf.lexer_type = 'standard' + parser_conf.parser_type = 'lalr' + _grammar_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {}) + return _grammar_parser.cache + +GRAMMAR_ERRORS = [ ('Unclosed parenthesis', ['a: (\n']), ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']), ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']), @@ -874,231 +831,291 @@ class GrammarLoader: ('%ignore expects a value', ['%ignore %import\n']), ] - def __init__(self, global_keep_all_tokens): - terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] - - rules = [options_from_rule(name, None, x) for name, x in RULES.items()] - rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) - for r, _p, xs, o in rules for i, x in enumerate(xs)] - callback = ParseTreeBuilder(rules, ST).create_callback() - import re - lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT']) - parser_conf = ParserConf(rules, callback, ['start']) - lexer_conf.lexer_type = 'standard' - parser_conf.parser_type = 'lalr' - self.parser = ParsingFrontend(lexer_conf, parser_conf, {}) - - self.canonize_tree = CanonizeTree() +def _parse_grammar(text, name, start='start'): + try: + return 
PrepareGrammar().transform(_grammar_parser().parse(text + '\n', start)) + except UnexpectedCharacters as e: + context = e.get_context(text) + raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % + (e.line, e.column, name, context)) + except UnexpectedToken as e: + context = e.get_context(text) + error = e.match_examples(_grammar_parser().parse, GRAMMAR_ERRORS, use_accepts=True) + if error: + raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) + elif 'STRING' in e.expected: + raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context)) + raise + + +class GrammarBuilder: + def __init__(self, global_keep_all_tokens=False, import_paths=None): self.global_keep_all_tokens = global_keep_all_tokens - - def import_grammar(self, grammar_path, base_path=None, import_paths=[]): - if grammar_path not in _imported_grammars: - # import_paths take priority over base_path since they should handle relative imports and ignore everything else. - to_try = import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] - for source in to_try: + self.import_paths = import_paths or [] + + self._definitions = {} + self._ignore_names = [] + + def _is_term(self, name): + # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME` + # Only the last part is the actual name, and the rest might contain mixed case + return name.rpartition('__')[-1].isupper() + + def _grammar_error(self, msg, *names): + args = {} + for i, name in enumerate(names, start=1): + postfix = '' if i == 1 else str(i) + args['name' + postfix] = name + args['type' + postfix] = lowercase_type = ("rule", "terminal")[self._is_term(name)] + args['Type' + postfix] = lowercase_type.title() + raise GrammarError(msg.format(**args)) + + def _check_options(self, name, options): + if self._is_term(name): + if options is None: + options = 1 + # if we don't use Integral here, we run into python2.7/python3 problems with long vs int + elif not isinstance(options, Integral): + raise GrammarError("Terminal require a single int as 'options' (e.g. 
priority), got %s" % (type(options),)) + else: + if options is None: + options = RuleOptions() + elif not isinstance(options, RuleOptions): + raise GrammarError("Rules require a RuleOptions instance as 'options'") + if self.global_keep_all_tokens: + options.keep_all_tokens = True + return options + + + def _define(self, name, exp, params=(), options=None, override=False): + if (name in self._definitions) ^ override: + if override: + self._grammar_error("Cannot override a nonexisting {type} {name}", name) + else: + self._grammar_error("{Type} '{name}' defined more than once", name) + if name.startswith('__'): + self._grammar_error('Names starting with double-underscore are reserved (Error at {name})', name) + self._definitions[name] = (params, exp, self._check_options(name, options)) + + def _extend(self, name, exp, params=(), options=None): + if name not in self._definitions: + self._grammar_error("Can't extend {type} {name} as it wasn't defined before", name) + if tuple(params) != tuple(self._definitions[name][0]): + self._grammar_error("Cannot extend {type} with different parameters: {name}", name) + # TODO: think about what to do with 'options' + base = self._definitions[name][1] + + while len(base.children) == 2: + assert isinstance(base.children[0], Tree) and base.children[0].data == 'expansions', base + base = base.children[0] + base.children.insert(0, exp) + + def _ignore(self, exp_or_name): + if isinstance(exp_or_name, str): + self._ignore_names.append(exp_or_name) + else: + assert isinstance(exp_or_name, Tree) + t = exp_or_name + if t.data == 'expansions' and len(t.children) == 1: + t2 ,= t.children + if t2.data=='expansion' and len(t2.children) == 1: + item ,= t2.children + if item.data == 'value': + item ,= item.children + if isinstance(item, Token) and item.type == 'TERMINAL': + self._ignore_names.append(item.value) + return + + name = '__IGNORE_%d'% len(self._ignore_names) + self._ignore_names.append(name) + self._definitions[name] = ((), t, 1) + + def _declare(self, *names): + for name in names: + self._define(name, None) + + def _mangle_exp(self, exp, mangle): + if mangle is None: + return exp + exp = deepcopy(exp) # TODO: is this needed + for t in exp.iter_subtrees(): + for i, c in enumerate(t.children): + if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'): + t.children[i] = Token(c.type, mangle(c.value)) + return exp + + + def _unpack_definition(self, tree, mangle): + if tree.data == 'rule': + name, params, exp, opts = options_from_rule(*tree.children) + else: + name = tree.children[0].value + params = () + opts = int(tree.children[1]) if len(tree.children) == 3 else 1 # priority + exp = tree.children[-1] + if mangle is not None: + params = tuple(mangle(p) for p in params) + name = mangle(name) + exp = self._mangle_exp(exp, mangle) + return name, exp, params, opts + + def _unpack_import(self, stmt, grammar_name): + if len(stmt.children) > 1: + path_node, arg1 = stmt.children + else: + path_node, = stmt.children + arg1 = None + + if isinstance(arg1, Tree): # Multi import + dotted_path = tuple(path_node.children) + names = arg1.children + aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names + else: # Single import + dotted_path = tuple(path_node.children[:-1]) + name = path_node.children[-1] # Get name from dotted path + aliases = {name.value: (arg1 or name).value} # Aliases if exist + + if path_node.data == 'import_lib': # Import from library + base_path = None + else: # Relative import + if grammar_name == 
'': # Import relative to script file path if grammar is coded in script try: - if callable(source): - joined_path, text = source(base_path, grammar_path) - else: - joined_path = os.path.join(source, grammar_path) - with open(joined_path, encoding='utf8') as f: - text = f.read() - except IOError: - continue - else: - grammar = self.load_grammar(text, joined_path, import_paths) - _imported_grammars[grammar_path] = grammar - break + base_file = os.path.abspath(sys.modules['__main__'].__file__) + except AttributeError: + base_file = None else: - # Search failed. Make Python throw a nice error. - open(grammar_path, encoding='utf8') - assert False - - return _imported_grammars[grammar_path] - - def load_grammar(self, grammar_text, grammar_name='', import_paths=[]): - """Parse grammar_text, verify, and create Grammar object. Display nice messages on error.""" - - try: - tree = self.canonize_tree.transform(self.parser.parse(grammar_text+'\n')) - except UnexpectedCharacters as e: - context = e.get_context(grammar_text) - raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % - (e.line, e.column, grammar_name, context)) - except UnexpectedToken as e: - context = e.get_context(grammar_text) - error = e.match_examples(self.parser.parse, self.ERRORS, use_accepts=True) - if error: - raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) - elif 'STRING' in e.expected: - raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context)) - raise - - tree = PrepareGrammar().transform(tree) - - # Extract grammar items - defs = classify(tree.children, lambda c: c.data, lambda c: c.children) - term_defs = defs.pop('term', []) - rule_defs = defs.pop('rule', []) - statements = defs.pop('statement', []) - assert not defs - - term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs] - term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs] - rule_defs = [options_from_rule(*x) for x in rule_defs] - - # Execute statements - ignore, imports = [], {} - overriding_rules = [] - for (stmt,) in statements: - if stmt.data == 'ignore': - t ,= stmt.children - ignore.append(t) - elif stmt.data == 'import': - if len(stmt.children) > 1: - path_node, arg1 = stmt.children + base_file = grammar_name # Import relative to grammar file path if external grammar file + if base_file: + if isinstance(base_file, PackageResource): + base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) else: - path_node ,= stmt.children - arg1 = None - - if isinstance(arg1, Tree): # Multi import - dotted_path = tuple(path_node.children) - names = arg1.children - aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names - else: # Single import - dotted_path = tuple(path_node.children[:-1]) - name = path_node.children[-1] # Get name from dotted path - aliases = {name: arg1 or name} # Aliases if exist - - if path_node.data == 'import_lib': # Import from library - base_path = None - else: # Relative import - if grammar_name == '': # Import relative to script file path if grammar is coded in script - try: - base_file = os.path.abspath(sys.modules['__main__'].__file__) - except AttributeError: - base_file = None - else: - base_file = grammar_name # Import relative to grammar file path if external grammar file - if base_file: - if isinstance(base_file, PackageResource): - base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) - else: - base_path = 
os.path.split(base_file)[0] - else: - base_path = os.path.abspath(os.path.curdir) - + base_path = os.path.split(base_file)[0] + else: + base_path = os.path.abspath(os.path.curdir) + + return dotted_path, base_path, aliases + + def load_grammar(self, grammar_text, grammar_name="", mangle=None): + tree = _parse_grammar(grammar_text, grammar_name) + imports = {} # imports are collect over the whole file to prevent duplications + actions = [] # Some statements need to be delayed (override and extend) till after imports are handled + for stmt in tree.children: + if stmt.data in ('term', 'rule'): + self._define(*self._unpack_definition(stmt, mangle)) + continue + assert stmt.data == 'statement', stmt.data + stmt ,= stmt.children + if stmt.data == 'import': + dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name) try: import_base_path, import_aliases = imports[dotted_path] assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path) import_aliases.update(aliases) except KeyError: imports[dotted_path] = base_path, aliases - + elif stmt.data == 'ignore': + # if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar + if mangle is None: + self._ignore(*stmt.children) elif stmt.data == 'declare': - for t in stmt.children: - term_defs.append([t.value, (None, None)]) - elif stmt.data == 'override_rule': + if mangle is None: + self._declare(*(t.value for t in stmt.children)) + else: + self._declare(*(mangle(t.value) for t in stmt.children)) + elif stmt.data == 'override': + r ,= stmt.children + actions.append((self._define, self._unpack_definition(r, mangle) + (True,))) + elif stmt.data == 'extend': r ,= stmt.children - overriding_rules.append(options_from_rule(*r.children)) + actions.append((self._extend, self._unpack_definition(r, mangle))) else: assert False, stmt - - # import grammars + for dotted_path, (base_path, aliases) in imports.items(): - grammar_path = os.path.join(*dotted_path) + EXT - g = self.import_grammar(grammar_path, base_path=base_path, import_paths=import_paths) - new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases) - - term_defs += new_td - rule_defs += new_rd - - # replace rules by overridding rules, according to name - for r in overriding_rules: - name = r[0] - # remove overridden rule from rule_defs - overridden, rule_defs = classify_bool(rule_defs, lambda r: r[0] == name) # FIXME inefficient - if not overridden: - raise GrammarError("Cannot override a nonexisting rule: %s" % name) - rule_defs.append(r) - - ## Handle terminals - - # Verify correctness 1 - for name, _ in term_defs: - if name.startswith('__'): - raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name) - - # Handle ignore tokens - # XXX A slightly hacky solution. 
Recognition of %ignore TERMINAL as separate comes from the lexer's - # inability to handle duplicate terminals (two names, one value) - ignore_names = [] - for t in ignore: - if t.data=='expansions' and len(t.children) == 1: - t2 ,= t.children - if t2.data=='expansion' and len(t2.children) == 1: - item ,= t2.children - if item.data == 'value': - item ,= item.children - if isinstance(item, Token) and item.type == 'TERMINAL': - ignore_names.append(item.value) - continue - - name = '__IGNORE_%d'% len(ignore_names) - ignore_names.append(name) - term_defs.append((name, (t, 1))) - - # Verify correctness 2 - terminal_names = set() - for name, _ in term_defs: - if name in terminal_names: - raise GrammarError("Terminal '%s' defined more than once" % name) - terminal_names.add(name) - - if set(ignore_names) > terminal_names: - raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(ignore_names) - terminal_names)) - - resolve_term_references(term_defs) - - ## Handle rules - - rule_names = {} - for name, params, _x, option in rule_defs: - # We can't just simply not throw away the tokens later, we need option.keep_all_tokens to correctly generate maybe_placeholders - if self.global_keep_all_tokens: - option.keep_all_tokens = True - - if name.startswith('__'): - raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name) - if name in rule_names: - raise GrammarError("Rule '%s' defined more than once" % name) - rule_names[name] = len(params) - - for name, params , expansions, _o in rule_defs: + self.do_import(dotted_path, base_path, aliases, mangle) + + for f, args in actions: + f(*args) + + def do_import(self, dotted_path, base_path, aliases, base_mangle=None): + mangle = self.get_mangle('__'.join(dotted_path), aliases, base_mangle) + grammar_path = os.path.join(*dotted_path) + EXT + to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] + for source in to_try: + try: + if callable(source): + joined_path, text = source(base_path, grammar_path) + else: + joined_path = os.path.join(source, grammar_path) + with open(joined_path, encoding='utf8') as f: + text = f.read() + except IOError: + continue + else: + self.load_grammar(text, joined_path, mangle) + break + else: + # Search failed. Make Python throw a nice error. 
+ open(grammar_path, encoding='utf8') + assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,) + + def get_mangle(self, prefix, aliases, base_mangle=None): + def mangle(s): + if s in aliases: + s = aliases[s] + else: + if s[0] == '_': + s = '_%s__%s' % (prefix, s[1:]) + else: + s = '%s__%s' % (prefix, s) + if base_mangle is not None: + s = base_mangle(s) + return s + return mangle + + def check(self): + for name, (params, exp, options) in self._definitions.items(): for i, p in enumerate(params): - if p in rule_names: + if p in self._definitions: raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name)) if p in params[:i]: raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name)) - for temp in expansions.find_data('template_usage'): + + if exp is None: # Remaining checks don't work for abstract rules/terminals + continue + + for temp in exp.find_data('template_usage'): sym = temp.children[0] args = temp.children[1:] if sym not in params: - if sym not in rule_names: - raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name)) - if len(args) != rule_names[sym]: - raise GrammarError("Wrong number of template arguments used for %s " - "(expected %s, got %s) (in rule %s)" % (sym, rule_names[sym], len(args), name)) - for sym in _find_used_symbols(expansions): - if sym.type == 'TERMINAL': - if sym not in terminal_names: - raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name)) - else: - if sym not in rule_names and sym not in params: - raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name)) - - return Grammar(rule_defs, term_defs, ignore_names) - + if sym not in self._definitions: + self._grammar_error("Template '%s' used but not defined (in {type} {name})" % sym, name) + if len(args) != len(self._definitions[sym][0]): + expected, actual = len(self._definitions[sym][0]), len(args) + self._grammar_error("Wrong number of template arguments used for {name} " + "(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name) + + for sym in _find_used_symbols(exp): + if sym not in self._definitions and sym not in params: + self._grammar_error("{Type} '{name}' used but not defined (in {type2} {name2})", sym, name) + + if not set(self._definitions).issuperset(self._ignore_names): + raise GrammarError("Terminals %s were marked to ignore but were not defined!" 
% (set(self._ignore_names) - set(self._definitions))) + + def build(self): + self.check() + rule_defs = [] + term_defs = [] + for name, (params, exp, options) in self._definitions.items(): + if self._is_term(name): + assert len(params) == 0 + term_defs.append((name, (exp, options))) + else: + rule_defs.append((name, params, exp, options)) + resolve_term_references(term_defs) + return Grammar(rule_defs, term_defs, self._ignore_names) def load_grammar(grammar, source, import_paths, global_keep_all_tokens): - return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths) + builder = GrammarBuilder(global_keep_all_tokens, import_paths) + builder.load_grammar(grammar, source) + return builder.build() diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 3ce76f6..221fbc0 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -3,8 +3,8 @@ from __future__ import absolute_import import sys from unittest import TestCase, main -from lark import Lark -from lark.load_grammar import GrammarLoader, GrammarError +from lark import Lark, Token, Tree +from lark.load_grammar import GrammarError, GRAMMAR_ERRORS class TestGrammar(TestCase): @@ -12,7 +12,7 @@ class TestGrammar(TestCase): pass def test_errors(self): - for msg, examples in GrammarLoader.ERRORS: + for msg, examples in GRAMMAR_ERRORS: for example in examples: try: p = Lark(example) @@ -21,7 +21,7 @@ class TestGrammar(TestCase): else: assert False, "example did not raise an error" - def test_override(self): + def test_override_rule(self): # Overrides the 'sep' template in existing grammar to add an optional terminating delimiter # Thus extending it beyond its original capacity p = Lark(""" @@ -29,12 +29,42 @@ class TestGrammar(TestCase): %override sep{item, delim}: item (delim item)* delim? %ignore " " - """) + """, source_path=__file__) a = p.parse('[1, 2, 3]') b = p.parse('[1, 2, 3, ]') assert a == b + def test_override_terminal(self): + p = Lark(""" + + %import .grammars.ab (startab, A, B) + + %override A: "c" + %override B: "d" + """, start='startab', source_path=__file__) + + a = p.parse('cd') + self.assertEqual(a.children[0].children, [Token('A', 'c'), Token('B', 'd')]) + + def test_extend_rule(self): + p = Lark(""" + %import .grammars.ab (startab, A, B, expr) + + %extend expr: B A + """, start='startab', source_path=__file__) + a = p.parse('abab') + self.assertEqual(a.children[0].children, ['a', Tree('expr', ['b', 'a']), 'b']) + + def test_extend_term(self): + p = Lark(""" + %import .grammars.ab (startab, A, B, expr) + + %extend A: "c" + """, start='startab', source_path=__file__) + a = p.parse('acbb') + self.assertEqual(a.children[0].children, ['a', Tree('expr', ['c', 'b']), 'b']) + if __name__ == '__main__':
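For reviewers who want to try the new programmatic API without the full `examples/advanced/grammar_building.py` setup, here is a smaller sketch of the `GrammarBuilder` flow described by `lark-stubs/load_grammar.pyi`. Both grammar strings are invented for illustration; only `GrammarBuilder`, its `load_grammar`/`build` methods, and `Lark` accepting a `Grammar` object come from this PR, and namespacing via `get_mangle` is left to the bundled example.

```python
from lark import Lark
from lark.load_grammar import GrammarBuilder

# Hypothetical core grammar plus a "plugin" grammar that %extend-s one of its rules.
CORE = r"""
start: instruction+
instruction: "add" NUMBER

%import common.NUMBER
%import common.WS
%ignore WS
"""

PLUGIN = r"""
%extend instruction: "sub" NUMBER
"""

builder = GrammarBuilder()
builder.load_grammar(CORE, 'core')      # grammar_name is used for error messages and relative imports
builder.load_grammar(PLUGIN, 'plugin')  # later grammars may %extend or %override earlier definitions
grammar = builder.build()               # returns a Grammar object, which Lark() accepts directly in this PR

parser = Lark(grammar, parser='lalr')
print(parser.parse("add 1 sub 2").pretty())
```

Keeping the builder separate from `Lark` is what lets a host application compose a core grammar with plugin grammars before any parser is constructed.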