"""Parses and creates Grammar objects""" import hashlib import os.path import sys from collections import namedtuple from copy import copy, deepcopy import pkgutil from ast import literal_eval from contextlib import suppress from typing import List, Tuple, Union, Callable, Dict, Optional from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import ParsingFrontend from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, TOKEN_DEFAULT_PRIORITY from .utils import classify, dedup_list from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError, UnexpectedInput from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive inline_args = v_args(inline=True) __path__ = os.path.dirname(__file__) IMPORT_PATHS = ['grammars'] EXT = '.lark' _RE_FLAGS = 'imslux' _EMPTY = Symbol('__empty__') _TERMINAL_NAMES = { '.' : 'DOT', ',' : 'COMMA', ':' : 'COLON', ';' : 'SEMICOLON', '+' : 'PLUS', '-' : 'MINUS', '*' : 'STAR', '/' : 'SLASH', '\\' : 'BACKSLASH', '|' : 'VBAR', '?' : 'QMARK', '!' : 'BANG', '@' : 'AT', '#' : 'HASH', '$' : 'DOLLAR', '%' : 'PERCENT', '^' : 'CIRCUMFLEX', '&' : 'AMPERSAND', '_' : 'UNDERSCORE', '<' : 'LESSTHAN', '>' : 'MORETHAN', '=' : 'EQUAL', '"' : 'DBLQUOTE', '\'' : 'QUOTE', '`' : 'BACKQUOTE', '~' : 'TILDE', '(' : 'LPAR', ')' : 'RPAR', '{' : 'LBRACE', '}' : 'RBRACE', '[' : 'LSQB', ']' : 'RSQB', '\n' : 'NEWLINE', '\r\n' : 'CRLF', '\t' : 'TAB', ' ' : 'SPACE', } # Grammar Parser TERMINALS = { '_LPAR': r'\(', '_RPAR': r'\)', '_LBRA': r'\[', '_RBRA': r'\]', '_LBRACE': r'\{', '_RBRACE': r'\}', 'OP': '[+*]|[?](?![a-z])', '_COLON': ':', '_COMMA': ',', '_OR': r'\|', '_DOT': r'\.(?!\.)', '_DOTDOT': r'\.\.', 'TILDE': '~', 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', '_NL_OR': r'(\r?\n)+\s*\|', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', 'BACKSLASH': r'\\[ ]*\n', '_TO': '->', '_IGNORE': r'%ignore', '_OVERRIDE': r'%override', '_DECLARE': r'%declare', '_EXTEND': r'%extend', '_IMPORT': r'%import', 'NUMBER': r'[+-]?\d+', } RULES = { 'start': ['_list'], '_list': ['_item', '_list _item'], '_item': ['rule', 'term', 'ignore', 'import', 'declare', 'override', 'extend', '_NL'], 'rule': ['RULE template_params _COLON expansions _NL', 'RULE template_params _DOT NUMBER _COLON expansions _NL'], 'template_params': ['_LBRACE _template_params _RBRACE', ''], '_template_params': ['RULE', '_template_params _COMMA RULE'], 'expansions': ['_expansions'], '_expansions': ['alias', '_expansions _OR alias', '_expansions _NL_OR alias'], '?alias': ['expansion _TO RULE', 'expansion'], 'expansion': ['_expansion'], '_expansion': ['', '_expansion expr'], '?expr': ['atom', 'atom OP', 'atom TILDE NUMBER', 'atom TILDE NUMBER _DOTDOT NUMBER', ], '?atom': ['_LPAR expansions _RPAR', 'maybe', 'value'], 'value': ['terminal', 'nonterminal', 'literal', 'range', 'template_usage'], 'terminal': ['TERMINAL'], 'nonterminal': ['RULE'], '?name': ['RULE', 'TERMINAL'], 'maybe': ['_LBRA expansions _RBRA'], 'range': ['STRING _DOTDOT STRING'], 'template_usage': ['RULE _LBRACE _template_args _RBRACE'], '_template_args': ['value', '_template_args _COMMA value'], 'term': ['TERMINAL _COLON expansions _NL', 'TERMINAL _DOT NUMBER _COLON expansions _NL'], 'override': ['_OVERRIDE rule', '_OVERRIDE term'], 'extend': ['_EXTEND rule', '_EXTEND term'], 'ignore': ['_IGNORE expansions _NL'], 'declare': ['_DECLARE _declare_args _NL'], 'import': ['_IMPORT _import_path _NL', '_IMPORT _import_path _LPAR name_list _RPAR _NL', '_IMPORT _import_path _TO name _NL'], '_import_path': ['import_lib', 'import_rel'], 'import_lib': ['_import_args'], 'import_rel': ['_DOT _import_args'], '_import_args': ['name', '_import_args _DOT name'], 'name_list': ['_name_list'], '_name_list': ['name', '_name_list _COMMA name'], '_declare_args': ['name', '_declare_args name'], 'literal': ['REGEXP', 'STRING'], } # Value 5 keeps the number of states in the lalr parser somewhat minimal # It isn't optimal, but close to it. See PR #949 SMALL_FACTOR_THRESHOLD = 5 # The Threshold whether repeat via ~ are split up into different rules # 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, # while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. # (See PR #949) REPEAT_BREAK_THRESHOLD = 50 @inline_args class EBNF_to_BNF(Transformer_InPlace): def __init__(self): self.new_rules = [] self.rules_cache = {} self.prefix = 'anon' self.i = 0 self.rule_options = None def _name_rule(self, inner): new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) self.i += 1 return new_name def _add_rule(self, key, name, expansions): t = NonTerminal(name) self.new_rules.append((name, expansions, self.rule_options)) self.rules_cache[key] = t return t def _add_recurse_rule(self, type_, expr): try: return self.rules_cache[expr] except KeyError: new_name = self._name_rule(type_) t = NonTerminal(new_name) tree = ST('expansions', [ ST('expansion', [expr]), ST('expansion', [t, expr]) ]) return self._add_rule(expr, new_name, tree) def _add_repeat_rule(self, a, b, target, atom): """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. When called recursively (into target), it repeats atom for x(n) times, where: x(0) = 1 x(n) = a(n) * x(n-1) + b Example rule when a=3, b=4: new_rule: target target target atom atom atom atom """ key = (a, b, target, atom) try: return self.rules_cache[key] except KeyError: new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): """Creates a rule that matches atom 0 to (a*n+b)-1 times. When target matches n times atom, and target_opt 0 to n-1 times target_opt, First we generate target * i followed by target_opt, for i from 0 to a-1 These match 0 to n*a - 1 times atom Then we generate target * a followed by atom * i, for i from 0 to b-1 These match n*a to n*a + b-1 times atom The created rule will not have any shift/reduce conflicts so that it can be used with lalr Example rule when a=3, b=4: new_rule: target_opt | target target_opt | target target target_opt | target target target | target target target atom | target target target atom atom | target target target atom atom atom """ key = (a, b, target, atom, "opt") try: return self.rules_cache[key] except KeyError: new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ ST('expansion', [target]*i + [target_opt]) for i in range(a) ] + [ ST('expansion', [target]*a + [atom]*i) for i in range(b) ]) return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. """ # For a small number of repeats, we can take the naive approach if mx < REPEAT_BREAK_THRESHOLD: return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) # For large repeat values, we break the repetition into sub-rules. # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. # We then use small_factors to split up mn and diff up into values [(a, b), ...] # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt # to generate a complete rule/expression that matches the corresponding number of repeats mn_target = rule for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) diff_target = rule # Match rule 1 times diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) for a, b in diff_factors[:-1]: diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) diff_target = self._add_repeat_rule(a, b, diff_target, rule) a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): if op.value == '?': empty = ST('expansion', []) return ST('expansions', [rule, empty]) elif op.value == '+': # a : b c+ d # --> # a : b _c d # _c : _c c | c; return self._add_recurse_rule('plus', rule) elif op.value == '*': # a : b c* d # --> # a : b _c? d # _c : _c c | c; new_name = self._add_recurse_rule('star', rule) return ST('expansions', [new_name, ST('expansion', [])]) elif op.value == '~': if len(args) == 1: mn = mx = int(args[0]) else: mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) return self._generate_repeats(rule, mn, mx) assert False, op def maybe(self, rule): keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens def will_not_get_removed(sym): if isinstance(sym, NonTerminal): return not sym.name.startswith('_') if isinstance(sym, Terminal): return keep_all_tokens or not sym.filter_out if sym is _EMPTY: return False assert False, sym empty = ST('expansion', [_EMPTY] * len(list(rule.scan_values(will_not_get_removed)))) return ST('expansions', [rule, empty]) class SimplifyRule_Visitor(Visitor): @staticmethod def _flatten(tree): while tree.expand_kids_by_data(tree.data): pass def expansion(self, tree): # rules_list unpacking # a : b (c|d) e # --> # a : b c e | b d e # # In AST terms: # expansion(b, expansions(c, d), e) # --> # expansions( expansion(b, c, e), expansion(b, d, e) ) self._flatten(tree) for i, child in enumerate(tree.children): if isinstance(child, Tree) and child.data == 'expansions': tree.data = 'expansions' tree.children = [self.visit(ST('expansion', [option if i == j else other for j, other in enumerate(tree.children)])) for option in dedup_list(child.children)] self._flatten(tree) break def alias(self, tree): rule, alias_name = tree.children if rule.data == 'expansions': aliases = [] for child in tree.children[0].children: aliases.append(ST('alias', [child, alias_name])) tree.data = 'expansions' tree.children = aliases def expansions(self, tree): self._flatten(tree) # Ensure all children are unique if len(set(tree.children)) != len(tree.children): tree.children = dedup_list(tree.children) # dedup is expensive, so try to minimize its use class RuleTreeToText(Transformer): def expansions(self, x): return x def expansion(self, symbols): return symbols, None def alias(self, x): (expansion, _alias), alias = x assert _alias is None, (alias, expansion, '-', _alias) # Double alias not allowed return expansion, alias.value class PrepareAnonTerminals(Transformer_InPlace): """Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them""" def __init__(self, terminals): self.terminals = terminals self.term_set = {td.name for td in self.terminals} self.term_reverse = {td.pattern: td for td in terminals} self.i = 0 self.rule_options = None @inline_args def pattern(self, p): value = p.value if p in self.term_reverse and p.flags != self.term_reverse[p].pattern.flags: raise GrammarError(u'Conflicting flags for the same terminal: %s' % p) term_name = None if isinstance(p, PatternStr): try: # If already defined, use the user-defined terminal name term_name = self.term_reverse[p].name except KeyError: # Try to assign an indicative anon-terminal name try: term_name = _TERMINAL_NAMES[value] except KeyError: if value and is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set: term_name = value.upper() if term_name in self.term_set: term_name = None elif isinstance(p, PatternRE): if p in self.term_reverse: # Kind of a weird placement.name term_name = self.term_reverse[p].name else: assert False, p if term_name is None: term_name = '__ANON_%d' % self.i self.i += 1 if term_name not in self.term_set: assert p not in self.term_reverse self.term_set.add(term_name) termdef = TerminalDef(term_name, p) self.term_reverse[p] = termdef self.terminals.append(termdef) filter_out = False if self.rule_options and self.rule_options.keep_all_tokens else isinstance(p, PatternStr) return Terminal(term_name, filter_out=filter_out) class _ReplaceSymbols(Transformer_InPlace): """Helper for ApplyTemplates""" def __init__(self): self.names = {} def value(self, c): if len(c) == 1 and isinstance(c[0], Token) and c[0].value in self.names: return self.names[c[0].value] return self.__default__('value', c, None) def template_usage(self, c): if c[0] in self.names: return self.__default__('template_usage', [self.names[c[0]].name] + c[1:], None) return self.__default__('template_usage', c, None) class ApplyTemplates(Transformer_InPlace): """Apply the templates, creating new rules that represent the used templates""" def __init__(self, rule_defs): self.rule_defs = rule_defs self.replacer = _ReplaceSymbols() self.created_templates = set() def template_usage(self, c): name = c[0] args = c[1:] result_name = "%s{%s}" % (name, ",".join(a.name for a in args)) if result_name not in self.created_templates: self.created_templates.add(result_name) (_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name) assert len(params) == len(args), args result_tree = deepcopy(tree) self.replacer.names = dict(zip(params, args)) self.replacer.transform(result_tree) self.rule_defs.append((result_name, [], result_tree, deepcopy(options))) return NonTerminal(result_name) def _rfind(s, choices): return max(s.rfind(c) for c in choices) def eval_escaping(s): w = '' i = iter(s) for n in i: w += n if n == '\\': try: n2 = next(i) except StopIteration: raise GrammarError("Literal ended unexpectedly (bad escaping): `%r`" % s) if n2 == '\\': w += '\\\\' elif n2 not in 'Uuxnftr': w += '\\' w += n2 w = w.replace('\\"', '"').replace("'", "\\'") to_eval = "u'''%s'''" % w try: s = literal_eval(to_eval) except SyntaxError as e: raise GrammarError(s, e) return s def _literal_to_pattern(literal): v = literal.value flag_start = _rfind(v, '/"')+1 assert flag_start > 0 flags = v[flag_start:] assert all(f in _RE_FLAGS for f in flags), flags if literal.type == 'STRING' and '\n' in v: raise GrammarError('You cannot put newlines in string literals') if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: raise GrammarError('You can only use newlines in regular expressions ' 'with the `x` (verbose) flag') v = v[:flag_start] assert v[0] == v[-1] and v[0] in '"/' x = v[1:-1] s = eval_escaping(x) if s == "": raise GrammarError("Empty terminals are not allowed (%s)" % literal) if literal.type == 'STRING': s = s.replace('\\\\', '\\') return PatternStr(s, flags, raw=literal.value) elif literal.type == 'REGEXP': return PatternRE(s, flags, raw=literal.value) else: assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' @inline_args class PrepareLiterals(Transformer_InPlace): def literal(self, literal): return ST('pattern', [_literal_to_pattern(literal)]) def range(self, start, end): assert start.type == end.type == 'STRING' start = start.value[1:-1] end = end.value[1:-1] assert len(eval_escaping(start)) == len(eval_escaping(end)) == 1 regexp = '[%s-%s]' % (start, end) return ST('pattern', [PatternRE(regexp)]) def _make_joined_pattern(regexp, flags_set): return PatternRE(regexp, ()) class TerminalTreeToPattern(Transformer_NonRecursive): def pattern(self, ps): p ,= ps return p def expansion(self, items): assert items if len(items) == 1: return items[0] pattern = ''.join(i.to_regexp() for i in items) return _make_joined_pattern(pattern, {i.flags for i in items}) def expansions(self, exps): if len(exps) == 1: return exps[0] # Do a bit of sorting to make sure that the longest option is returned # (Python's re module otherwise prefers just 'l' when given (l|ll) and both could match) exps.sort(key=lambda x: (-x.max_width, -x.min_width, -len(x.value))) pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps)) return _make_joined_pattern(pattern, {i.flags for i in exps}) def expr(self, args): inner, op = args[:2] if op == '~': if len(args) == 3: op = "{%d}" % int(args[2]) else: mn, mx = map(int, args[2:]) if mx < mn: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx)) op = "{%d,%d}" % (mn, mx) else: assert len(args) == 2 return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags) def maybe(self, expr): return self.expr(expr + ['?']) def alias(self, t): raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)") def value(self, v): return v[0] class PrepareSymbols(Transformer_InPlace): def value(self, v): v ,= v if isinstance(v, Tree): return v elif v.type == 'RULE': return NonTerminal(str(v.value)) elif v.type == 'TERMINAL': return Terminal(str(v.value), filter_out=v.startswith('_')) assert False def nr_deepcopy_tree(t): """Deepcopy tree `t` without recursion""" return Transformer_NonRecursive(False).transform(t) class Grammar: term_defs: List[Tuple[str, Tuple[Tree, int]]] rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]] ignore: List[str] def __init__(self, rule_defs: List[Tuple[str, Tuple[str, ...], Tree, RuleOptions]], term_defs: List[Tuple[str, Tuple[Tree, int]]], ignore: List[str]) -> None: self.term_defs = term_defs self.rule_defs = rule_defs self.ignore = ignore def compile(self, start, terminals_to_keep): # We change the trees in-place (to support huge grammars) # So deepcopy allows calling compile more than once. term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs] rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs] # =================== # Compile Terminals # =================== # Convert terminal-trees to strings/regexps for name, (term_tree, priority) in term_defs: if term_tree is None: # Terminal added through %declare continue expansions = list(term_tree.find_data('expansion')) if len(expansions) == 1 and not expansions[0].children: raise GrammarError("Terminals cannot be empty (%s)" % name) transformer = PrepareLiterals() * TerminalTreeToPattern() terminals = [TerminalDef(name, transformer.transform(term_tree), priority) for name, (term_tree, priority) in term_defs if term_tree] # ================= # Compile Rules # ================= # 1. Pre-process terminals anon_tokens_transf = PrepareAnonTerminals(terminals) transformer = PrepareLiterals() * PrepareSymbols() * anon_tokens_transf # Adds to terminals # 2. Inline Templates transformer *= ApplyTemplates(rule_defs) # 3. Convert EBNF to BNF (and apply step 1 & 2) ebnf_to_bnf = EBNF_to_BNF() rules = [] i = 0 while i < len(rule_defs): # We have to do it like this because rule_defs might grow due to templates name, params, rule_tree, options = rule_defs[i] i += 1 if len(params) != 0: # Dont transform templates continue rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None ebnf_to_bnf.rule_options = rule_options ebnf_to_bnf.prefix = name anon_tokens_transf.rule_options = rule_options tree = transformer.transform(rule_tree) res = ebnf_to_bnf.transform(tree) rules.append((name, res, options)) rules += ebnf_to_bnf.new_rules assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" # 4. Compile tree to Rule objects rule_tree_to_text = RuleTreeToText() simplify_rule = SimplifyRule_Visitor() compiled_rules = [] for rule_content in rules: name, tree, options = rule_content simplify_rule.visit(tree) expansions = rule_tree_to_text.transform(tree) for i, (expansion, alias) in enumerate(expansions): if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)"% (name, alias)) empty_indices = [x==_EMPTY for x in expansion] if any(empty_indices): exp_options = copy(options) or RuleOptions() exp_options.empty_indices = empty_indices expansion = [x for x in expansion if x!=_EMPTY] else: exp_options = options for sym in expansion: assert isinstance(sym, Symbol) if sym.is_term and exp_options and exp_options.keep_all_tokens: sym.filter_out = False rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) compiled_rules.append(rule) # Remove duplicates of empty rules, throw error for non-empty duplicates if len(set(compiled_rules)) != len(compiled_rules): duplicates = classify(compiled_rules, lambda x: x) for dups in duplicates.values(): if len(dups) > 1: if dups[0].expansion: raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups)) # Empty rule; assert all other attributes are equal assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) # Remove duplicates compiled_rules = list(set(compiled_rules)) # Filter out unused rules while True: c = len(compiled_rules) used_rules = {s for r in compiled_rules for s in r.expansion if isinstance(s, NonTerminal) and s != r.origin} used_rules |= {NonTerminal(s) for s in start} compiled_rules, unused = classify_bool(compiled_rules, lambda r: r.origin in used_rules) for r in unused: logger.debug("Unused rule: %s", r) if len(compiled_rules) == c: break # Filter out unused terminals if terminals_to_keep != '*': used_terms = {t.name for r in compiled_rules for t in r.expansion if isinstance(t, Terminal)} terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep) if unused: logger.debug("Unused terminals: %s", [t.name for t in unused]) return terminals, compiled_rules, self.ignore PackageResource = namedtuple('PackageResource', 'pkg_name path') class FromPackageLoader: """ Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`. This allows them to be compatible even from within zip files. Relative imports are handled, so you can just freely use them. pkg_name: The name of the package. You can probably provide `__name__` most of the time search_paths: All the path that will be search on absolute imports. """ pkg_name: str search_paths: Tuple[str, ...] def __init__(self, pkg_name: str, search_paths: Tuple[str, ...]=("", )) -> None: self.pkg_name = pkg_name self.search_paths = search_paths def __repr__(self): return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths) def __call__(self, base_path: Union[None, str, PackageResource], grammar_path: str) -> Tuple[PackageResource, str]: if base_path is None: to_try = self.search_paths else: # Check whether or not the importing grammar was loaded by this module. if not isinstance(base_path, PackageResource) or base_path.pkg_name != self.pkg_name: # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway raise IOError() to_try = [base_path.path] for path in to_try: full_path = os.path.join(path, grammar_path) try: text = pkgutil.get_data(self.pkg_name, full_path) except IOError: continue else: return PackageResource(self.pkg_name, full_path), text.decode() raise IOError() stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) def resolve_term_references(term_dict): # TODO Solve with transitive closure (maybe) while True: changed = False for name, token_tree in term_dict.items(): if token_tree is None: # Terminal added through %declare continue for exp in token_tree.find_data('value'): item ,= exp.children if isinstance(item, Token): if item.type == 'RULE': raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) if item.type == 'TERMINAL': try: term_value = term_dict[item] except KeyError: raise GrammarError("Terminal used but not defined: %s" % item) assert term_value is not None exp.children[0] = term_value changed = True if not changed: break for name, term in term_dict.items(): if term: # Not just declared for child in term.children: ids = [id(x) for x in child.iter_subtrees()] if id(term) in ids: raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) def options_from_rule(name, params, *x): if len(x) > 1: priority, expansions = x priority = int(priority) else: expansions ,= x priority = None params = [t.value for t in params.children] if params is not None else [] # For the grammar parser keep_all_tokens = name.startswith('!') name = name.lstrip('!') expand1 = name.startswith('?') name = name.lstrip('?') return name, params, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority, template_source=(name if params else None)) def symbols_from_strcase(expansion): return [Terminal(x, filter_out=x.startswith('_')) if x.isupper() else NonTerminal(x) for x in expansion] @inline_args class PrepareGrammar(Transformer_InPlace): def terminal(self, name): return name def nonterminal(self, name): return name def _find_used_symbols(tree): assert tree.data == 'expansions' return {t for x in tree.find_data('expansion') for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} def _get_parser(): try: return _get_parser.cache except AttributeError: terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] rules = [options_from_rule(name, None, x) for name, x in RULES.items()] rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)] callback = ParseTreeBuilder(rules, ST).create_callback() import re lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT', 'BACKSLASH']) parser_conf = ParserConf(rules, callback, ['start']) lexer_conf.lexer_type = 'basic' parser_conf.parser_type = 'lalr' _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) return _get_parser.cache GRAMMAR_ERRORS = [ ('Incorrect type of value', ['a: 1\n']), ('Unclosed parenthesis', ['a: (\n']), ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']), ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']), ('Illegal name for rules or terminals', ['Aa:\n']), ('Alias expects lowercase name', ['a: -> "a"\n']), ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']), ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']), ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']), ('Terminal names cannot contain dots', ['A.B\n']), ('Expecting rule or terminal definition', ['"a"\n']), ('%import expects a name', ['%import "a"\n']), ('%ignore expects a value', ['%ignore %import\n']), ] def _translate_parser_exception(parse, e): error = e.match_examples(parse, GRAMMAR_ERRORS, use_accepts=True) if error: return error elif 'STRING' in e.expected: return "Expecting a value" def _parse_grammar(text, name, start='start'): try: tree = _get_parser().parse(text + '\n', start) except UnexpectedCharacters as e: context = e.get_context(text) raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" % (e.line, e.column, name, context)) except UnexpectedToken as e: context = e.get_context(text) error = _translate_parser_exception(_get_parser().parse, e) if error: raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context)) raise return PrepareGrammar().transform(tree) def _error_repr(error): if isinstance(error, UnexpectedToken): error2 = _translate_parser_exception(_get_parser().parse, error) if error2: return error2 expected = ', '.join(error.accepts or error.expected) return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), expected) else: return str(error) def _search_interactive_parser(interactive_parser, predicate): def expand(node): path, p = node for choice in p.choices(): t = Token(choice, '') try: new_p = p.feed_token(t) except ParseError: # Illegal pass else: yield path + (choice,), new_p for path, p in bfs_all_unique([((), interactive_parser)], expand): if predicate(p): return path, p def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: errors = [] def on_error(e): errors.append((e, _error_repr(e))) # recover to a new line token_path, _ = _search_interactive_parser(e.interactive_parser.as_immutable(), lambda p: '_NL' in p.choices()) for token_type in token_path: e.interactive_parser.feed_token(Token(token_type, '')) e.interactive_parser.feed_token(Token('_NL', '\n')) return True _tree = _get_parser().parse(text + '\n', start, on_error=on_error) errors_by_line = classify(errors, lambda e: e[0].line) errors = [el[0] for el in errors_by_line.values()] # already sorted for e in errors: e[0].interactive_parser = None return errors def _get_mangle(prefix, aliases, base_mangle=None): def mangle(s): if s in aliases: s = aliases[s] else: if s[0] == '_': s = '_%s__%s' % (prefix, s[1:]) else: s = '%s__%s' % (prefix, s) if base_mangle is not None: s = base_mangle(s) return s return mangle def _mangle_exp(exp, mangle): if mangle is None: return exp exp = deepcopy(exp) # TODO: is this needed for t in exp.iter_subtrees(): for i, c in enumerate(t.children): if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'): t.children[i] = Token(c.type, mangle(c.value)) return exp class GrammarBuilder: global_keep_all_tokens: bool import_paths: List[Union[str, Callable]] used_files: Dict[str, str] def __init__(self, global_keep_all_tokens: bool=False, import_paths: Optional[List[Union[str, Callable]]]=None, used_files: Optional[Dict[str, str]]=None) -> None: self.global_keep_all_tokens = global_keep_all_tokens self.import_paths = import_paths or [] self.used_files = used_files or {} self._definitions = {} self._ignore_names = [] def _is_term(self, name): # Imported terminals are of the form `Path__to__Grammar__file__TERMINAL_NAME` # Only the last part is the actual name, and the rest might contain mixed case return name.rpartition('__')[-1].isupper() def _grammar_error(self, msg, *names): args = {} for i, name in enumerate(names, start=1): postfix = '' if i == 1 else str(i) args['name' + postfix] = name args['type' + postfix] = lowercase_type = ("rule", "terminal")[self._is_term(name)] args['Type' + postfix] = lowercase_type.title() raise GrammarError(msg.format(**args)) def _check_options(self, name, options): if self._is_term(name): if options is None: options = 1 elif not isinstance(options, int): raise GrammarError("Terminal require a single int as 'options' (e.g. priority), got %s" % (type(options),)) else: if options is None: options = RuleOptions() elif not isinstance(options, RuleOptions): raise GrammarError("Rules require a RuleOptions instance as 'options'") if self.global_keep_all_tokens: options.keep_all_tokens = True return options def _define(self, name, exp, params=(), options=None, override=False): if name in self._definitions: if not override: self._grammar_error("{Type} '{name}' defined more than once", name) elif override: self._grammar_error("Cannot override a nonexisting {type} {name}", name) if name.startswith('__'): self._grammar_error('Names starting with double-underscore are reserved (Error at {name})', name) self._definitions[name] = (params, exp, self._check_options(name, options)) def _extend(self, name, exp, params=(), options=None): if name not in self._definitions: self._grammar_error("Can't extend {type} {name} as it wasn't defined before", name) if tuple(params) != tuple(self._definitions[name][0]): self._grammar_error("Cannot extend {type} with different parameters: {name}", name) # TODO: think about what to do with 'options' base = self._definitions[name][1] assert isinstance(base, Tree) and base.data == 'expansions' base.children.insert(0, exp) def _ignore(self, exp_or_name): if isinstance(exp_or_name, str): self._ignore_names.append(exp_or_name) else: assert isinstance(exp_or_name, Tree) t = exp_or_name if t.data == 'expansions' and len(t.children) == 1: t2 ,= t.children if t2.data=='expansion' and len(t2.children) == 1: item ,= t2.children if item.data == 'value': item ,= item.children if isinstance(item, Token) and item.type == 'TERMINAL': self._ignore_names.append(item.value) return name = '__IGNORE_%d'% len(self._ignore_names) self._ignore_names.append(name) self._definitions[name] = ((), t, TOKEN_DEFAULT_PRIORITY) def _declare(self, *names): for name in names: self._define(name, None) def _unpack_import(self, stmt, grammar_name): if len(stmt.children) > 1: path_node, arg1 = stmt.children else: path_node, = stmt.children arg1 = None if isinstance(arg1, Tree): # Multi import dotted_path = tuple(path_node.children) names = arg1.children aliases = dict(zip(names, names)) # Can't have aliased multi import, so all aliases will be the same as names else: # Single import dotted_path = tuple(path_node.children[:-1]) if not dotted_path: name ,= path_node.children raise GrammarError("Nothing was imported from grammar `%s`" % name) name = path_node.children[-1] # Get name from dotted path aliases = {name.value: (arg1 or name).value} # Aliases if exist if path_node.data == 'import_lib': # Import from library base_path = None else: # Relative import if grammar_name == '': # Import relative to script file path if grammar is coded in script try: base_file = os.path.abspath(sys.modules['__main__'].__file__) except AttributeError: base_file = None else: base_file = grammar_name # Import relative to grammar file path if external grammar file if base_file: if isinstance(base_file, PackageResource): base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0]) else: base_path = os.path.split(base_file)[0] else: base_path = os.path.abspath(os.path.curdir) return dotted_path, base_path, aliases def _unpack_definition(self, tree, mangle): if tree.data == 'rule': name, params, exp, opts = options_from_rule(*tree.children) else: name = tree.children[0].value params = () # TODO terminal templates opts = int(tree.children[1]) if len(tree.children) == 3 else TOKEN_DEFAULT_PRIORITY # priority exp = tree.children[-1] if mangle is not None: params = tuple(mangle(p) for p in params) name = mangle(name) exp = _mangle_exp(exp, mangle) return name, exp, params, opts def load_grammar(self, grammar_text: str, grammar_name: str="", mangle: Optional[Callable[[str], str]]=None) -> None: tree = _parse_grammar(grammar_text, grammar_name) imports = {} for stmt in tree.children: if stmt.data == 'import': dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name) try: import_base_path, import_aliases = imports[dotted_path] assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path) import_aliases.update(aliases) except KeyError: imports[dotted_path] = base_path, aliases for dotted_path, (base_path, aliases) in imports.items(): self.do_import(dotted_path, base_path, aliases, mangle) for stmt in tree.children: if stmt.data in ('term', 'rule'): self._define(*self._unpack_definition(stmt, mangle)) elif stmt.data == 'override': r ,= stmt.children self._define(*self._unpack_definition(r, mangle), override=True) elif stmt.data == 'extend': r ,= stmt.children self._extend(*self._unpack_definition(r, mangle)) elif stmt.data == 'ignore': # if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar if mangle is None: self._ignore(*stmt.children) elif stmt.data == 'declare': names = [t.value for t in stmt.children] if mangle is None: self._declare(*names) else: self._declare(*map(mangle, names)) elif stmt.data == 'import': pass else: assert False, stmt term_defs = { name: exp for name, (_params, exp, _options) in self._definitions.items() if self._is_term(name) } resolve_term_references(term_defs) def _remove_unused(self, used): def rule_dependencies(symbol): if self._is_term(symbol): return [] try: params, tree,_ = self._definitions[symbol] except KeyError: return [] return _find_used_symbols(tree) - set(params) _used = set(bfs(used, rule_dependencies)) self._definitions = {k: v for k, v in self._definitions.items() if k in _used} def do_import(self, dotted_path: Tuple[str, ...], base_path: Optional[str], aliases: Dict[str, str], base_mangle: Optional[Callable[[str], str]]=None) -> None: assert dotted_path mangle = _get_mangle('__'.join(dotted_path), aliases, base_mangle) grammar_path = os.path.join(*dotted_path) + EXT to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader] for source in to_try: try: if callable(source): joined_path, text = source(base_path, grammar_path) else: joined_path = os.path.join(source, grammar_path) with open(joined_path, encoding='utf8') as f: text = f.read() except IOError: continue else: h = hashlib.md5(text.encode('utf8')).hexdigest() if self.used_files.get(joined_path, h) != h: raise RuntimeError("Grammar file was changed during importing") self.used_files[joined_path] = h gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files) gb.load_grammar(text, joined_path, mangle) gb._remove_unused(map(mangle, aliases)) for name in gb._definitions: if name in self._definitions: raise GrammarError("Cannot import '%s' from '%s': Symbol already defined." % (name, grammar_path)) self._definitions.update(**gb._definitions) break else: # Search failed. Make Python throw a nice error. open(grammar_path, encoding='utf8') assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,) def validate(self) -> None: for name, (params, exp, _options) in self._definitions.items(): for i, p in enumerate(params): if p in self._definitions: raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name)) if p in params[:i]: raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name)) if exp is None: # Remaining checks don't apply to abstract rules/terminals continue for temp in exp.find_data('template_usage'): sym = temp.children[0] args = temp.children[1:] if sym not in params: if sym not in self._definitions: self._grammar_error("Template '%s' used but not defined (in {type} {name})" % sym, name) if len(args) != len(self._definitions[sym][0]): expected, actual = len(self._definitions[sym][0]), len(args) self._grammar_error("Wrong number of template arguments used for {name} " "(expected %s, got %s) (in {type2} {name2})" % (expected, actual), sym, name) for sym in _find_used_symbols(exp): if sym not in self._definitions and sym not in params: self._grammar_error("{Type} '{name}' used but not defined (in {type2} {name2})", sym, name) if not set(self._definitions).issuperset(self._ignore_names): raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions))) def build(self) -> Grammar: self.validate() rule_defs = [] term_defs = [] for name, (params, exp, options) in self._definitions.items(): if self._is_term(name): assert len(params) == 0 term_defs.append((name, (exp, options))) else: rule_defs.append((name, params, exp, options)) # resolve_term_references(term_defs) return Grammar(rule_defs, term_defs, self._ignore_names) def verify_used_files(file_hashes): for path, old in file_hashes.items(): text = None if isinstance(path, str) and os.path.exists(path): with open(path, encoding='utf8') as f: text = f.read() elif isinstance(path, PackageResource): with suppress(IOError): text = pkgutil.get_data(*path).decode('utf-8') if text is None: # We don't know how to load the path. ignore it. continue current = hashlib.md5(text.encode()).hexdigest() if old != current: logger.info("File %r changed, rebuilding Parser" % path) return False return True def list_grammar_imports(grammar, import_paths=[]): "Returns a list of paths to the lark grammars imported by the given grammar (recursively)" builder = GrammarBuilder(False, import_paths) builder.load_grammar(grammar, '') return list(builder.used_files.keys()) def load_grammar(grammar, source, import_paths, global_keep_all_tokens): builder = GrammarBuilder(global_keep_all_tokens, import_paths) builder.load_grammar(grammar, source) return builder.build(), builder.used_files