diff --git a/examples/lark.lark b/examples/lark.lark
index 8a5ac66..fef6b07 100644
--- a/examples/lark.lark
+++ b/examples/lark.lark
@@ -4,8 +4,11 @@ _item: rule
      | token
      | statement
 
-rule: RULE priority? ":" expansions _NL
-token: TOKEN priority? ":" expansions _NL
+rule: RULE rule_params priority? ":" expansions _NL
+token: TOKEN token_params priority? ":" expansions _NL
+
+rule_params: ["{" RULE ("," RULE)* "}"]
+token_params: ["{" TOKEN ("," TOKEN)* "}"]
 
 priority: "." NUMBER
 
@@ -27,9 +30,12 @@ name_list: "(" name ("," name)* ")"
 
 ?atom: "(" expansions ")"
      | "[" expansions "]" -> maybe
-     | STRING ".." STRING -> literal_range
-     | name
-     | (REGEXP | STRING) -> literal
+     | value
+
+?value: STRING ".." STRING -> literal_range
+      | name
+      | (REGEXP | STRING) -> literal
+      | name "{" value ("," value)* "}" -> template_usage
 
 name: RULE
     | TOKEN
diff --git a/examples/template_lark.lark b/examples/template_lark.lark
new file mode 100644
index 0000000..296407f
--- /dev/null
+++ b/examples/template_lark.lark
@@ -0,0 +1,56 @@
+start: (_item | _NL)*
+
+_item: rule
+     | token
+     | statement
+
+_rule_or_token: RULE
+              | TOKEN
+rule: RULE rule_params priority? ":" expansions{_rule_or_token} _NL
+token: TOKEN priority? ":" expansions{TOKEN} _NL
+
+rule_params: ["{" RULE ("," RULE)* "}"]
+
+priority: "." NUMBER
+
+statement: "%ignore" expansions{TOKEN} _NL -> ignore
+         | "%import" import_path{_rule_or_token} ["->" _rule_or_token] _NL -> import
+         | "%import" import_path{_rule_or_token} name_list{_rule_or_token} _NL -> multi_import
+         | "%declare" TOKEN+ -> declare
+
+!import_path{name}: "."? name ("." name)*
+name_list{name}: "(" name ("," name)* ")"
+
+?expansions{name}: alias{name} (_VBAR alias{name})*
+
+?alias{name}: expansion{name} ["->" RULE]
+
+?expansion{name}: expr{name}*
+
+?expr{name}: atom{name} [OP | "~" NUMBER [".." NUMBER]]
+
+?atom{name}: "(" expansions{name} ")"
+           | "[" expansions{name} "]" -> maybe
+           | value{name}
+
+?value{name}: STRING ".." STRING -> literal_range
+            | name
+            | (REGEXP | STRING) -> literal
+            | name "{" value{name} ("," value{name})* "}" -> template_usage
+
+_VBAR: _NL? "|"
+OP: /[+*]|[?](?![a-z])/
+RULE: /!?[_?]?[a-z][_a-z0-9]*/
+TOKEN: /_?[A-Z][_A-Z0-9]*/
+STRING: _STRING "i"?
+REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
+_NL: /(\r?\n)+\s*/
+
+%import common.ESCAPED_STRING -> _STRING
+%import common.INT -> NUMBER
+%import common.WS_INLINE
+
+COMMENT: /\s*/ "//" /[^\n]/*
+
+%ignore WS_INLINE
+%ignore COMMENT
diff --git a/lark/grammar.py b/lark/grammar.py
index cf8cf64..bb84351 100644
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -49,19 +49,21 @@ class NonTerminal(Symbol):
 
 
 class RuleOptions(Serialize):
-    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'
+    __serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices'
 
-    def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
+    def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
         self.priority = priority
+        self.template_source = template_source
         self.empty_indices = empty_indices
 
     def __repr__(self):
-        return 'RuleOptions(%r, %r, %r)' % (
+        return 'RuleOptions(%r, %r, %r, %r)' % (
             self.keep_all_tokens,
             self.expand1,
             self.priority,
+            self.template_source
         )
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 051f8cd..3fe10a3 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -73,6 +73,8 @@ TERMINALS = {
     '_RPAR': r'\)',
     '_LBRA': r'\[',
     '_RBRA': r'\]',
+    '_LBRACE': r'\{',
+    '_RBRACE': r'\}',
     'OP': '[+*]|[?](?![a-z])',
     '_COLON': ':',
     '_COMMA': ',',
@@ -99,8 +101,12 @@ RULES = {
     '_list':  ['_item', '_list _item'],
     '_item':  ['rule', 'term', 'statement', '_NL'],
 
-    'rule': ['RULE _COLON expansions _NL',
-             'RULE _DOT NUMBER _COLON expansions _NL'],
+    'rule': ['RULE template_params _COLON expansions _NL',
+             'RULE template_params _DOT NUMBER _COLON expansions _NL'],
+    'template_params': ['_LBRACE _template_params _RBRACE',
+                        ''],
+    '_template_params': ['RULE',
+                         '_template_params _COMMA RULE'],
     'expansions': ['alias',
                    'expansions _OR alias',
                    'expansions _NL _OR alias'],
@@ -123,7 +129,8 @@ RULES = {
     'value': ['terminal',
               'nonterminal',
               'literal',
-              'range'],
+              'range',
+              'template_usage'],
 
     'terminal': ['TERMINAL'],
     'nonterminal': ['RULE'],
@@ -132,9 +139,13 @@ RULES = {
 
     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOTDOT STRING'],
+
+    'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
+    '_template_args': ['value',
+                       '_template_args _COMMA value'],
 
     'term': ['TERMINAL _COLON expansions _NL',
-            'TERMINAL _DOT NUMBER _COLON expansions _NL'],
+             'TERMINAL _DOT NUMBER _COLON expansions _NL'],
 
     'statement': ['ignore', 'import', 'declare'],
     'ignore': ['_IGNORE expansions _NL'],
     'declare': ['_DECLARE _declare_args _NL'],
@@ -340,6 +351,39 @@ class PrepareAnonTerminals(Transformer_InPlace):
         return Terminal(term_name, filter_out=isinstance(p, PatternStr))
 
 
+class _ReplaceSymbols(Transformer_InPlace):
+    " Helper for ApplyTemplates "
+
+    def __init__(self):
+        self.names = {}
+
+    def value(self, c):
+        if len(c) == 1 and isinstance(c[0], Token) and c[0].type == 'RULE' and c[0].value in self.names:
+            return self.names[c[0].value]
+        return self.__default__('value', c, None)
+
+class ApplyTemplates(Transformer_InPlace):
+    " Apply the templates, creating new rules that represent the used templates "
+
+    def __init__(self, rule_defs):
+        self.rule_defs = rule_defs
+        self.replacer = _ReplaceSymbols()
+        self.created_templates = set()
+
+    def template_usage(self, c):
+        name = c[0]
+        args = c[1:]
+        result_name = "%s{%s}" % (name, ",".join(a.name for a in args))
+        if result_name not in self.created_templates:
+            self.created_templates.add(result_name)
+            (_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name)
+            assert len(params) == len(args), args
+            result_tree = deepcopy(tree)
+            self.replacer.names = dict(zip(params, args))
+            self.replacer.transform(result_tree)
+            self.rule_defs.append((result_name, [], result_tree, deepcopy(options)))
+        return NonTerminal(result_name)
+
+
 def _rfind(s, choices):
     return max(s.rfind(c) for c in choices)
 
@@ -465,19 +509,28 @@ class Grammar:
 
         transformer = PrepareLiterals() * TerminalTreeToPattern()
         terminals = [TerminalDef(name, transformer.transform( term_tree ), priority)
-                    for name, (term_tree, priority) in term_defs if term_tree]
+                     for name, (term_tree, priority) in term_defs if term_tree]
 
         # =================
         #  Compile Rules
         # =================
 
         # 1. Pre-process terminals
-        transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals) # Adds to terminals
+        transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals)  # Adds to terminals
+
+        # 2. Inline Templates
 
-        # 2. Convert EBNF to BNF (and apply step 1)
+        transformer *= ApplyTemplates(rule_defs)
+
+        # 3. Convert EBNF to BNF (and apply step 1 & 2)
         ebnf_to_bnf = EBNF_to_BNF()
         rules = []
-        for name, rule_tree, options in rule_defs:
+        i = 0
+        while i < len(rule_defs):  # We have to do it like this because rule_defs might grow due to templates
+            name, params, rule_tree, options = rule_defs[i]
+            i += 1
+            if len(params) != 0:  # Don't transform templates
+                continue
             ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None
             ebnf_to_bnf.prefix = name
             tree = transformer.transform(rule_tree)
@@ -487,7 +540,7 @@ class Grammar:
 
         assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
 
-        # 3. Compile tree to Rule objects
+        # 4. Compile tree to Rule objects
         rule_tree_to_text = RuleTreeToText()
         simplify_rule = SimplifyRule_Visitor()
 
@@ -575,7 +628,7 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
     """
 
     imported_terms = dict(grammar.term_defs)
-    imported_rules = {n:(n,deepcopy(t),o) for n,t,o in grammar.rule_defs}
+    imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs}
 
     term_defs = []
     rule_defs = []
@@ -584,14 +637,19 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
         if symbol.type != 'RULE':
             return []
         try:
-            _, tree, _ = imported_rules[symbol]
+            _, params, tree, _ = imported_rules[symbol]
         except KeyError:
             raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
+        return _find_used_symbols(tree) - set(params)
 
-        return _find_used_symbols(tree)
-
-    def get_namespace_name(name):
+    def get_namespace_name(name, params):
+        if params is not None:
+            try:
+                return params[name]
+            except KeyError:
+                pass
         try:
             return aliases[name].value
         except KeyError:
@@ -602,15 +660,18 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
     to_import = list(bfs(aliases, rule_dependencies))
     for symbol in to_import:
         if symbol.type == 'TERMINAL':
-            term_defs.append([get_namespace_name(symbol), imported_terms[symbol]])
+            term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]])
         else:
             assert symbol.type == 'RULE'
-            rule = imported_rules[symbol]
-            for t in rule[1].iter_subtrees():
+            _, params, tree, options = imported_rules[symbol]
+            params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s' ) % (namespace, p) for p in params}
+            for t in tree.iter_subtrees():
                 for i, c in enumerate(t.children):
                     if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
-                        t.children[i] = Token(c.type, get_namespace_name(c))
-            rule_defs.append((get_namespace_name(symbol), rule[1], rule[2]))
+                        t.children[i] = Token(c.type, get_namespace_name(c, params_map))
+            params = [params_map[p] for p in params]  # We cannot rely on ordered dictionaries
+            rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options))
+
 
     return term_defs, rule_defs
 
@@ -648,20 +709,22 @@ def resolve_term_references(term_defs):
             raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name)
 
 
-def options_from_rule(name, *x):
+def options_from_rule(name, params, *x):
     if len(x) > 1:
         priority, expansions = x
         priority = int(priority)
     else:
         expansions ,= x
         priority = None
+    params = [t.value for t in params.children] if params is not None else []  # For the grammar parser
 
     keep_all_tokens = name.startswith('!')
     name = name.lstrip('!')
     expand1 = name.startswith('?')
     name = name.lstrip('?')
 
-    return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority)
+    return name, params, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority,
+                                                 template_source=(name if params else None))
 
 
 def symbols_from_strcase(expansion):
@@ -684,8 +747,8 @@ class GrammarLoader:
     def __init__(self):
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
-        rules = [options_from_rule(name, x) for name, x in RULES.items()]
-        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, xs, o in rules for i, x in enumerate(xs)]
+        rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
+        rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
         callback = ParseTreeBuilder(rules, ST).create_callback()
         lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
 
@@ -832,23 +895,37 @@ class GrammarLoader:
 
         rules = rule_defs
 
-        rule_names = set()
-        for name, _x, _o in rules:
+        rule_names = {}
+        for name, params, _x, _o in rules:
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
             if name in rule_names:
                 raise GrammarError("Rule '%s' defined more than once" % name)
-            rule_names.add(name)
-
-        for name, expansions, _o in rules:
+            rule_names[name] = len(params)
+
+        for name, params, expansions, _o in rules:
+            for i, p in enumerate(params):
+                if p in rule_names:
+                    raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name))
+                if p in params[:i]:
+                    raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name))
+            for temp in expansions.find_data('template_usage'):
+                sym = temp.children[0]
+                args = temp.children[1:]
+                if sym not in rule_names:
+                    raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name))
+                if len(args) != rule_names[sym]:
+                    raise GrammarError("Wrong number of template arguments used for %s "
+                                       "(expected %s, got %s) (in rule %s)" % (sym, rule_names[sym], len(args), name))
             for sym in _find_used_symbols(expansions):
                 if sym.type == 'TERMINAL':
                     if sym not in terminal_names:
                         raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
                 else:
-                    if sym not in rule_names:
+                    if sym not in rule_names and sym not in params:
                         raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))
+
 
         return Grammar(rules, term_defs, ignore_names)
diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index 11c7fac..904d186 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -243,7 +243,7 @@ class ParseTreeBuilder:
 
         for rule, wrapper_chain in self.rule_builders:
 
-            user_callback_name = rule.alias or rule.origin.name
+            user_callback_name = rule.alias or rule.options.template_source or rule.origin.name
            try:
                 f = getattr(transformer, user_callback_name)
                 # XXX InlineTransformer is deprecated!
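
A note on the `parse_tree_builder.py` change above: every rule generated from a template carries `template_source` in its options, so `Transformer` callbacks resolve to the template's original name rather than the synthesized `name{args}` rule name. A minimal sketch of that behavior — the grammar mirrors `test_templates` below; the `IntList` transformer and the internal rule name quoted in the comment are illustrative, not part of this diff:

```python
from lark import Lark, Transformer

grammar = r"""
start: "[" sep{NUMBER, ","} "]"
sep{item, delim}: item (delim item)*
NUMBER: /\d+/
%ignore " "
"""

class IntList(Transformer):
    # ApplyTemplates names the expanded rule something like "sep{NUMBER,COMMA}",
    # but template_source maps it back to "sep", so this callback still fires.
    def sep(self, items):
        return [int(tok) for tok in items]

parser = Lark(grammar)
tree = parser.parse("[1, 2, 3, 4]")
assert IntList().transform(tree).children == [[1, 2, 3, 4]]
```
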
diff --git a/tests/grammars/templates.lark b/tests/grammars/templates.lark
new file mode 100644
index 0000000..1631188
--- /dev/null
+++ b/tests/grammars/templates.lark
@@ -0,0 +1 @@
+sep{item, delim}: item (delim item)*
\ No newline at end of file
diff --git a/tests/test_parser.py b/tests/test_parser.py
index a2275ad..6b9df3f 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -822,7 +822,6 @@ def _make_parser_test(LEXER, PARSER):
         x = g.parse('Hello HelloWorld')
         self.assertSequenceEqual(x.children, ['HelloWorld'])
 
-
     def test_token_collision2(self):
         g = _Lark("""
                 !start: "starts"
@@ -832,7 +831,70 @@ def _make_parser_test(LEXER, PARSER):
         x = g.parse("starts")
         self.assertSequenceEqual(x.children, ['starts'])
 
-
+
+    def test_templates(self):
+        g = _Lark(r"""
+            start: "[" sep{NUMBER, ","} "]"
+            sep{item, delim}: item (delim item)*
+            NUMBER: /\d+/
+            %ignore " "
+        """)
+        x = g.parse("[1, 2, 3, 4]")
+        self.assertSequenceEqual(x.children, [Tree('sep', ['1', '2', '3', '4'])])
+        x = g.parse("[1]")
+        self.assertSequenceEqual(x.children, [Tree('sep', ['1'])])
+
+    def test_templates_recursion(self):
+        g = _Lark(r"""
+            start: "[" _sep{NUMBER, ","} "]"
+            _sep{item, delim}: item | _sep{item, delim} delim item
+            NUMBER: /\d+/
+            %ignore " "
+        """)
+        x = g.parse("[1, 2, 3, 4]")
+        self.assertSequenceEqual(x.children, ['1', '2', '3', '4'])
+        x = g.parse("[1]")
+        self.assertSequenceEqual(x.children, ['1'])
+
+    def test_templates_import(self):
+        g = _Lark_open("test_templates_import.lark", rel_to=__file__)
+        x = g.parse("[1, 2, 3, 4]")
+        self.assertSequenceEqual(x.children, [Tree('sep', ['1', '2', '3', '4'])])
+        x = g.parse("[1]")
+        self.assertSequenceEqual(x.children, [Tree('sep', ['1'])])
+
+    def test_templates_alias(self):
+        g = _Lark(r"""
+            start: expr{"C"}
+            expr{t}: "A" t
+                   | "B" t -> b
+        """)
+        x = g.parse("AC")
+        self.assertSequenceEqual(x.children, [Tree('expr', [])])
+        x = g.parse("BC")
+        self.assertSequenceEqual(x.children, [Tree('b', [])])
+
+    def test_templates_modifiers(self):
+        g = _Lark(r"""
+            start: expr{"B"}
+            !expr{t}: "A" t
+        """)
+        x = g.parse("AB")
+        self.assertSequenceEqual(x.children, [Tree('expr', ["A", "B"])])
+        g = _Lark(r"""
+            start: _expr{"B"}
+            !_expr{t}: "A" t
+        """)
+        x = g.parse("AB")
+        self.assertSequenceEqual(x.children, ["A", "B"])
+        g = _Lark(r"""
+            start: expr{b}
+            b: "B"
+            ?expr{t}: "A" t
+        """)
+        x = g.parse("AB")
+        self.assertSequenceEqual(x.children, [Tree('b', [])])
+
     def test_g_regex_flags(self):
         g = _Lark("""
             start: "a" /b+/ C
diff --git a/tests/test_templates_import.lark b/tests/test_templates_import.lark
new file mode 100644
index 0000000..a1272b8
--- /dev/null
+++ b/tests/test_templates_import.lark
@@ -0,0 +1,4 @@
+start: "[" sep{NUMBER, ","} "]"
+NUMBER: /\d+/
+%ignore " "
+%import .grammars.templates.sep
\ No newline at end of file
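
As `test_templates_recursion` shows, the usual underscore convention applies to templates too: a template whose name starts with `_` is inlined at the point of use, so its matches land directly in the parent's children instead of under a `sep` subtree. A runnable standalone sketch of that test's grammar (the asserts mirror the test's expectations; Tokens compare equal to plain strings):

```python
from lark import Lark

# An underscore-prefixed template (_sep) is inlined like any other _rule,
# so the NUMBER tokens appear directly under `start`.
parser = Lark(r"""
    start: "[" _sep{NUMBER, ","} "]"
    _sep{item, delim}: item | _sep{item, delim} delim item
    NUMBER: /\d+/
    %ignore " "
""")

assert parser.parse("[1, 2, 3, 4]").children == ['1', '2', '3', '4']
assert parser.parse("[1]").children == ['1']
```
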