Browse Source

Merge branch 'template_rules' of https://github.com/MegaIng/lark

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.8.6
Erez Sh 5 years ago
parent
commit
2d1a058f0d
8 changed files with 247 additions and 39 deletions
  1. +11
    -5
      examples/lark.lark
  2. +56
    -0
      examples/template_lark.lark
  3. +5
    -3
      lark/grammar.py
  4. +105
    -28
      lark/load_grammar.py
  5. +1
    -1
      lark/parse_tree_builder.py
  6. +1
    -0
      tests/grammars/templates.lark
  7. +64
    -2
      tests/test_parser.py
  8. +4
    -0
      tests/test_templates_import.lark

+ 11
- 5
examples/lark.lark View File

@@ -4,8 +4,11 @@ _item: rule
| token
| statement

rule: RULE priority? ":" expansions _NL
token: TOKEN priority? ":" expansions _NL
rule: RULE rule_params priority? ":" expansions _NL
token: TOKEN token_params priority? ":" expansions _NL

rule_params: ["{" RULE ("," RULE)* "}"]
token_params: ["{" TOKEN ("," TOKEN)* "}"]

priority: "." NUMBER

@@ -27,9 +30,12 @@ name_list: "(" name ("," name)* ")"

?atom: "(" expansions ")"
| "[" expansions "]" -> maybe
| STRING ".." STRING -> literal_range
| name
| (REGEXP | STRING) -> literal
| value

?value: STRING ".." STRING -> literal_range
| name
| (REGEXP | STRING) -> literal
| name "{" value ("," value)* "}" -> template_usage

name: RULE
| TOKEN


+ 56
- 0
examples/template_lark.lark View File

@@ -0,0 +1,56 @@
start: (_item | _NL)*

_item: rule
| token
| statement

_rule_or_token: RULE
| TOKEN
rule: RULE rule_params priority? ":" expansions{_rule_or_token} _NL
token: TOKEN priority? ":" expansions{TOKEN} _NL

rule_params: ["{" RULE ("," RULE)* "}"]

priority: "." NUMBER

statement: "%ignore" expansions{TOKEN} _NL -> ignore
| "%import" import_path{_rule_or_token} ["->" _rule_or_token] _NL -> import
| "%import" import_path{_rule_or_token} name_list{_rule_or_token} _NL -> multi_import
| "%declare" TOKEN+ -> declare

!import_path{name}: "."? name ("." name)*
name_list{name}: "(" name ("," name)* ")"

?expansions{name}: alias{name} (_VBAR alias{name})*

?alias{name}: expansion{name} ["->" RULE]

?expansion{name}: expr{name}*

?expr{name}: atom{name} [OP | "~" NUMBER [".." NUMBER]]

?atom{name}: "(" expansions{name} ")"
| "[" expansions{name} "]" -> maybe
| value{name}

?value{name}: STRING ".." STRING -> literal_range
| name
| (REGEXP | STRING) -> literal
| name "{" value{name} ("," value{name})* "}" -> template_usage

_VBAR: _NL? "|"
OP: /[+*]|[?](?![a-z])/
RULE: /!?[_?]?[a-z][_a-z0-9]*/
TOKEN: /_?[A-Z][_A-Z0-9]*/
STRING: _STRING "i"?
REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
_NL: /(\r?\n)+\s*/

%import common.ESCAPED_STRING -> _STRING
%import common.INT -> NUMBER
%import common.WS_INLINE

COMMENT: /\s*/ "//" /[^\n]/*

%ignore WS_INLINE
%ignore COMMENT

+ 5
- 3
lark/grammar.py View File

@@ -49,19 +49,21 @@ class NonTerminal(Symbol):


class RuleOptions(Serialize):
__serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'empty_indices'
__serialize_fields__ = 'keep_all_tokens', 'expand1', 'priority', 'template_source', 'empty_indices'

def __init__(self, keep_all_tokens=False, expand1=False, priority=None, empty_indices=()):
def __init__(self, keep_all_tokens=False, expand1=False, priority=None, template_source=None, empty_indices=()):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.priority = priority
self.template_source = template_source
self.empty_indices = empty_indices

def __repr__(self):
return 'RuleOptions(%r, %r, %r)' % (
return 'RuleOptions(%r, %r, %r, %r)' % (
self.keep_all_tokens,
self.expand1,
self.priority,
self.template_source
)




+ 105
- 28
lark/load_grammar.py View File

@@ -73,6 +73,8 @@ TERMINALS = {
'_RPAR': r'\)',
'_LBRA': r'\[',
'_RBRA': r'\]',
'_LBRACE': r'\{',
'_RBRACE': r'\}',
'OP': '[+*]|[?](?![a-z])',
'_COLON': ':',
'_COMMA': ',',
@@ -99,8 +101,12 @@ RULES = {
'_list': ['_item', '_list _item'],
'_item': ['rule', 'term', 'statement', '_NL'],

'rule': ['RULE _COLON expansions _NL',
'RULE _DOT NUMBER _COLON expansions _NL'],
'rule': ['RULE template_params _COLON expansions _NL',
'RULE template_params _DOT NUMBER _COLON expansions _NL'],
'template_params': ['_LBRACE _template_params _RBRACE',
''],
'_template_params': ['RULE',
'_template_params _COMMA RULE'],
'expansions': ['alias',
'expansions _OR alias',
'expansions _NL _OR alias'],
@@ -123,7 +129,8 @@ RULES = {
'value': ['terminal',
'nonterminal',
'literal',
'range'],
'range',
'template_usage'],

'terminal': ['TERMINAL'],
'nonterminal': ['RULE'],
@@ -132,9 +139,13 @@ RULES = {

'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOTDOT STRING'],
'template_usage': ['RULE _LBRACE _template_args _RBRACE'],
'_template_args': ['value',
'_template_args _COMMA value'],

'term': ['TERMINAL _COLON expansions _NL',
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
'TERMINAL _DOT NUMBER _COLON expansions _NL'],
'statement': ['ignore', 'import', 'declare'],
'ignore': ['_IGNORE expansions _NL'],
'declare': ['_DECLARE _declare_args _NL'],
@@ -340,6 +351,39 @@ class PrepareAnonTerminals(Transformer_InPlace):

return Terminal(term_name, filter_out=isinstance(p, PatternStr))

class _ReplaceSymbols(Transformer_InPlace):
" Helper for ApplyTemplates "
def __init__(self):
self.names = {}

def value(self, c):
if len(c) == 1 and isinstance(c[0], Token) and c[0].type == 'RULE' and c[0].value in self.names:
return self.names[c[0].value]
return self.__default__('value', c, None)

class ApplyTemplates(Transformer_InPlace):
" Apply the templates, creating new rules that represent the used templates "

def __init__(self, rule_defs):
self.rule_defs = rule_defs
self.replacer = _ReplaceSymbols()
self.created_templates = set()
def template_usage(self, c):
name = c[0]
args = c[1:]
result_name = "%s{%s}" % (name, ",".join(a.name for a in args))
if result_name not in self.created_templates:
self.created_templates.add(result_name)
(_n, params, tree, options) ,= (t for t in self.rule_defs if t[0] == name)
assert len(params) == len(args), args
result_tree = deepcopy(tree)
self.replacer.names = dict(zip(params, args))
self.replacer.transform(result_tree)
self.rule_defs.append((result_name, [], result_tree, deepcopy(options)))
return NonTerminal(result_name)


def _rfind(s, choices):
return max(s.rfind(c) for c in choices)
@@ -465,19 +509,28 @@ class Grammar:

transformer = PrepareLiterals() * TerminalTreeToPattern()
terminals = [TerminalDef(name, transformer.transform( term_tree ), priority)
for name, (term_tree, priority) in term_defs if term_tree]
for name, (term_tree, priority) in term_defs if term_tree]

# =================
# Compile Rules
# =================

# 1. Pre-process terminals
transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals) # Adds to terminals
transformer = PrepareLiterals() * PrepareSymbols() * PrepareAnonTerminals(terminals) # Adds to terminals

# 2. Inline Templates

# 2. Convert EBNF to BNF (and apply step 1)
transformer *= ApplyTemplates(rule_defs)

# 3. Convert EBNF to BNF (and apply step 1 & 2)
ebnf_to_bnf = EBNF_to_BNF()
rules = []
for name, rule_tree, options in rule_defs:
i = 0
while i < len(rule_defs): # We have to do it like this because rule_defs might grow due to templates
name, params, rule_tree, options = rule_defs[i]
i += 1
if len(params) != 0: # Dont transform templates
continue
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None
ebnf_to_bnf.prefix = name
tree = transformer.transform(rule_tree)
@@ -487,7 +540,7 @@ class Grammar:

assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"

# 3. Compile tree to Rule objects
# 4. Compile tree to Rule objects
rule_tree_to_text = RuleTreeToText()

simplify_rule = SimplifyRule_Visitor()
@@ -575,7 +628,7 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
"""

imported_terms = dict(grammar.term_defs)
imported_rules = {n:(n,deepcopy(t),o) for n,t,o in grammar.rule_defs}
imported_rules = {n:(n,p,deepcopy(t),o) for n,p,t,o in grammar.rule_defs}

term_defs = []
rule_defs = []
@@ -584,14 +637,19 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
if symbol.type != 'RULE':
return []
try:
_, tree, _ = imported_rules[symbol]
_, params, tree,_ = imported_rules[symbol]
except KeyError:
raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
return _find_used_symbols(tree) - set(params)

return _find_used_symbols(tree)


def get_namespace_name(name):
def get_namespace_name(name, params):
if params is not None:
try:
return params[name]
except KeyError:
pass
try:
return aliases[name].value
except KeyError:
@@ -602,15 +660,18 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
to_import = list(bfs(aliases, rule_dependencies))
for symbol in to_import:
if symbol.type == 'TERMINAL':
term_defs.append([get_namespace_name(symbol), imported_terms[symbol]])
term_defs.append([get_namespace_name(symbol, None), imported_terms[symbol]])
else:
assert symbol.type == 'RULE'
rule = imported_rules[symbol]
for t in rule[1].iter_subtrees():
_, params, tree, options = imported_rules[symbol]
params_map = {p: ('%s__%s' if p[0]!='_' else '_%s__%s' ) % (namespace, p) for p in params}
for t in tree.iter_subtrees():
for i, c in enumerate(t.children):
if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
t.children[i] = Token(c.type, get_namespace_name(c))
rule_defs.append((get_namespace_name(symbol), rule[1], rule[2]))
t.children[i] = Token(c.type, get_namespace_name(c, params_map))
params = [params_map[p] for p in params] # We can not rely on ordered dictionaries
rule_defs.append((get_namespace_name(symbol, params_map), params, tree, options))


return term_defs, rule_defs

@@ -648,20 +709,22 @@ def resolve_term_references(term_defs):
raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name)


def options_from_rule(name, *x):
def options_from_rule(name, params, *x):
if len(x) > 1:
priority, expansions = x
priority = int(priority)
else:
expansions ,= x
priority = None
params = [t.value for t in params.children] if params is not None else [] # For the grammar parser

keep_all_tokens = name.startswith('!')
name = name.lstrip('!')
expand1 = name.startswith('?')
name = name.lstrip('?')

return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority)
return name, params, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority,
template_source=(name if params else None))


def symbols_from_strcase(expansion):
@@ -684,8 +747,8 @@ class GrammarLoader:
def __init__(self):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, xs, o in rules for i, x in enumerate(xs)]
rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

@@ -832,23 +895,37 @@ class GrammarLoader:

rules = rule_defs

rule_names = set()
for name, _x, _o in rules:
rule_names = {}
for name, params, _x, _o in rules:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if name in rule_names:
raise GrammarError("Rule '%s' defined more than once" % name)
rule_names.add(name)

for name, expansions, _o in rules:
rule_names[name] = len(params)

for name, params , expansions, _o in rules:
for i, p in enumerate(params):
if p in rule_names:
raise GrammarError("Template Parameter conflicts with rule %s (in template %s)" % (p, name))
if p in params[:i]:
raise GrammarError("Duplicate Template Parameter %s (in template %s)" % (p, name))
for temp in expansions.find_data('template_usage'):
sym = temp.children[0]
args = temp.children[1:]
if sym not in rule_names:
raise GrammarError("Template '%s' used but not defined (in rule %s)" % (sym, name))
if len(args) != rule_names[sym]:
raise GrammarError("Wrong number of template arguments used for %s "
"(expected %s, got %s) (in rule %s)"%(sym, rule_names[sym], len(args), name))
for sym in _find_used_symbols(expansions):
if sym.type == 'TERMINAL':
if sym not in terminal_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
else:
if sym not in rule_names:
if sym not in rule_names and sym not in params:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))


return Grammar(rules, term_defs, ignore_names)




+ 1
- 1
lark/parse_tree_builder.py View File

@@ -243,7 +243,7 @@ class ParseTreeBuilder:

for rule, wrapper_chain in self.rule_builders:

user_callback_name = rule.alias or rule.origin.name
user_callback_name = rule.alias or rule.options.template_source or rule.origin.name
try:
f = getattr(transformer, user_callback_name)
# XXX InlineTransformer is deprecated!


+ 1
- 0
tests/grammars/templates.lark View File

@@ -0,0 +1 @@
sep{item, delim}: item (delim item)*

+ 64
- 2
tests/test_parser.py View File

@@ -822,7 +822,6 @@ def _make_parser_test(LEXER, PARSER):
x = g.parse('Hello HelloWorld')
self.assertSequenceEqual(x.children, ['HelloWorld'])


def test_token_collision2(self):
g = _Lark("""
!start: "starts"
@@ -832,7 +831,70 @@ def _make_parser_test(LEXER, PARSER):

x = g.parse("starts")
self.assertSequenceEqual(x.children, ['starts'])

def test_templates(self):
g = _Lark(r"""
start: "[" sep{NUMBER, ","} "]"
sep{item, delim}: item (delim item)*
NUMBER: /\d+/
%ignore " "
""")
x = g.parse("[1, 2, 3, 4]")
self.assertSequenceEqual(x.children, [Tree('sep', ['1', '2', '3', '4'])])
x = g.parse("[1]")
self.assertSequenceEqual(x.children, [Tree('sep', ['1'])])

def test_templates_recursion(self):
g = _Lark(r"""
start: "[" _sep{NUMBER, ","} "]"
_sep{item, delim}: item | _sep{item, delim} delim item
NUMBER: /\d+/
%ignore " "
""")
x = g.parse("[1, 2, 3, 4]")
self.assertSequenceEqual(x.children, ['1', '2', '3', '4'])
x = g.parse("[1]")
self.assertSequenceEqual(x.children, ['1'])

def test_templates_import(self):
g = _Lark_open("test_templates_import.lark", rel_to=__file__)
x = g.parse("[1, 2, 3, 4]")
self.assertSequenceEqual(x.children, [Tree('sep', ['1', '2', '3', '4'])])
x = g.parse("[1]")
self.assertSequenceEqual(x.children, [Tree('sep', ['1'])])

def test_templates_alias(self):
g = _Lark(r"""
start: expr{"C"}
expr{t}: "A" t
| "B" t -> b
""")
x = g.parse("AC")
self.assertSequenceEqual(x.children, [Tree('expr', [])])
x = g.parse("BC")
self.assertSequenceEqual(x.children, [Tree('b', [])])
def test_templates_modifiers(self):
g = _Lark(r"""
start: expr{"B"}
!expr{t}: "A" t
""")
x = g.parse("AB")
self.assertSequenceEqual(x.children, [Tree('expr', ["A", "B"])])
g = _Lark(r"""
start: _expr{"B"}
!_expr{t}: "A" t
""")
x = g.parse("AB")
self.assertSequenceEqual(x.children, ["A", "B"])
g = _Lark(r"""
start: expr{b}
b: "B"
?expr{t}: "A" t
""")
x = g.parse("AB")
self.assertSequenceEqual(x.children, [Tree('b',[])])

def test_g_regex_flags(self):
g = _Lark("""
start: "a" /b+/ C


+ 4
- 0
tests/test_templates_import.lark View File

@@ -0,0 +1,4 @@
start: "[" sep{NUMBER, ","} "]"
NUMBER: /\d+/
%ignore " "
%import .grammars.templates.sep

Loading…
Cancel
Save