
Initial support for EBNF in tokens (automatic compilation to regexps)

Erez Shinan, 7 years ago
commit c70355389f
9 changed files with 263 additions and 117 deletions
  1. examples/calc.py             +5   -3
  2. examples/conf.py             +3   -2
  3. examples/json_parser.py      +1   -1
  4. lark/lark.py                 +5   -10
  5. lark/lexer.py                +4   -3
  6. lark/load_grammar.py         +212 -97
  7. lark/parsers/lalr_parser.py  +1   -0
  8. lark/tree.py                 +10  -0
  9. tests/test_parser.py         +22  -1
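
Note: in short, token definitions can now use the same EBNF operators as rules (literals, |, ?, *, + and a new ".." range form) and are compiled down to a single regexp before lexing, while the old per-token .ignore flag is replaced by a grammar-level %ignore statement. A minimal sketch of the new syntax, modeled on the updated examples/calc.py below; the parser='lalr' option and the exact output are assumptions, not part of this commit:

    from lark import Lark

    # Sketch only: NUMBER uses the new EBNF token form, %ignore replaces WS.ignore.
    parser = Lark("""
        start: NUMBER (PLUS NUMBER)*

        PLUS: "+"
        NUMBER: ("0".."9"|".")+    // EBNF form (compiles to a regexp)

        %ignore " "
    """, parser='lalr')

    print(parser.parse("1 + 2.5 + 10"))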

examples/calc.py  (+5, -3)

@@ -22,13 +22,15 @@ calc_grammar = """
      | product "*" atom -> mul
      | product "/" atom -> div

-?atom: /[\d.]+/ -> number
+?atom: NUMBER -> number
      | "-" atom -> neg
      | NAME -> var
      | "(" sum ")"

-NAME: /[a-zA-Z]\w*/
-WS.ignore: /\s+/
+NAME: /[a-zA-Z]\w+/      // Regexp form
+NUMBER: ("0".."9"|".")+  // EBNF form (compiles to regexp)
+
+%ignore " "|"\t"
 """

 class CalculateTree(InlineTransformer):
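
Note: following the TokenTreeToRegexp transformer added in lark/load_grammar.py further down, the EBNF form above should compile roughly like this: the range "0".."9" becomes a character class, string parts are re.escape()d, the alternation joins with |, and + wraps the group in a non-capturing repeat. A quick check of that expected output (illustration only, not code from the commit):

    import re

    # Expected compilation of:  NUMBER: ("0".."9"|".")+
    expected = r'(?:[0-9]|\.)+'

    assert re.match(expected, '3.1415').group() == '3.1415'
    assert re.match(expected, 'abc') is None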


examples/conf.py  (+3, -2)

@@ -22,9 +22,10 @@ parser = Lark(r"""
 NAME: /[a-zA-Z_]\w*/
 VALUE: /.*/

-WS.ignore: /[\t \f]+/
-COMMENT.ignore: /\#[^\n]*/
 _NL: /(\r?\n)+/
+
+%ignore /[\t \f]+/
+%ignore /\#[^\n]*/
 """, parser="lalr_contextual_lexer")






examples/json_parser.py  (+1, -1)

@@ -27,7 +27,7 @@ json_grammar = r"""
 number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
 string : /".*?(?<!\\)"/

-WS.ignore: /[ \t\n]+/
+%ignore /[ \t\n]+/
 """

 class TreeToJson(Transformer):


lark/lark.py  (+5, -10)

@@ -112,16 +112,11 @@ class Lark:
         assert not self.options.profile, "Feature temporarily disabled"
         self.profiler = Profiler() if self.options.profile else None

-        tokens, self.rules = load_grammar(grammar)
-        self.ignore_tokens = []
-        for tokendef, flags in tokens:
-            for flag in flags:
-                if flag == 'ignore':
-                    self.ignore_tokens.append(tokendef.name)
-                else:
-                    raise GrammarError("No such flag: %s" % flag)
-
-        self.lexer_conf = LexerConf([t[0] for t in tokens], self.ignore_tokens, self.options.postlex)
+        self.grammar = load_grammar(grammar)
+        tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=True)
+        self.ignore_tokens = self.grammar.extra['ignore']
+
+        self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)

         if not self.options.only_lex:
             self.parser = self._build_parser()
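
Note: the net effect of this change is that load_grammar() now returns a Grammar object, Grammar.compile() hands back ready-made TokenDef objects plus the compiled rules, and the ignore list comes from %ignore statements rather than per-token flags. A small end-to-end sketch through the public API; the parser option and the printed values are assumptions (the __IGNORE_n naming is taken from Grammar.compile below), the snippet itself is not from the commit:

    from lark import Lark

    parser = Lark("""start: WORD WORD
                     WORD: ("a".."z")+
                     %ignore " "
                     """, parser='lalr')

    # %ignore definitions are compiled into anonymous __IGNORE_n tokens
    # (see Grammar.compile in lark/load_grammar.py below).
    print(parser.ignore_tokens)           # e.g. ['__IGNORE_0']
    print(parser.parse("hello   world"))  # the ignored spaces never reach the parser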


lark/lexer.py  (+4, -3)

@@ -2,7 +2,7 @@

 import re

-from .utils import Str, classify
+from .utils import Str, classify, STRING_TYPE
 from .common import is_terminal

 class LexError(Exception):
@@ -10,6 +10,7 @@ class LexError(Exception):

 class TokenDef(object):
     def __init__(self, name, value):
+        assert isinstance(value, STRING_TYPE), value
         self.name = name
         self.value = value

@@ -94,7 +95,7 @@ def _create_unless(tokens):

 class Lexer(object):
     def __init__(self, tokens, ignore=()):
-        assert all(isinstance(t, TokenDef) for t in tokens)
+        assert all(isinstance(t, TokenDef) for t in tokens), tokens

         self.ignore = ignore
         self.newline_char = '\n'
@@ -176,7 +177,7 @@ class ContextualLexer:
     def __init__(self, tokens, states, ignore=(), always_accept=()):
         tokens_by_name = {}
         for t in tokens:
-            assert t.name not in tokens_by_name
+            assert t.name not in tokens_by_name, t
             tokens_by_name[t.name] = t

         lexer_by_tokens = {}


lark/load_grammar.py  (+212, -97)

@@ -1,3 +1,4 @@
+from itertools import chain
 import re
 import codecs

@@ -61,20 +62,21 @@ TOKENS = {
     '_COLON': ':',
     '_OR': r'\|',
     '_DOT': r'\.',
+    '_PERCENT': r'%',
     'RULE': '!?[_?]?[a-z][_a-z0-9]*',
     'TOKEN': '_?[A-Z][_A-Z0-9]*',
     'STRING': r'".*?[^\\]"',
     'REGEXP': r"/(?!/).*?[^\\]/",
     '_NL': r'(\r?\n)+\s*',
     'WS': r'[ \t]+',
-    'COMMENT': r'//[^\n]*\n',
+    'COMMENT': r'//[^\n]*',
     '_TO': '->'
 }

 RULES = {
-    'start': ['list'],
-    'list': ['item', 'list item'],
-    'item': ['rule', 'token', '_NL'],
+    'start': ['_list'],
+    '_list': ['_item', '_list _item'],
+    '_item': ['rule', 'token', 'statement', '_NL'],

     'rule': ['RULE _COLON expansions _NL'],
     'expansions': ['alias',
@@ -93,17 +95,16 @@ RULES = {
              'maybe',
              'RULE',
              'TOKEN',
-             'anontoken'],
-
-    'anontoken': ['tokenvalue'],
+             'tokenvalue',
+             'range'],

     'maybe': ['_LBRA expansions _RBRA'],
+    'range': ['STRING _DOT _DOT STRING'],

-    'token': ['TOKEN _COLON tokenvalue _NL',
-              'TOKEN tokenmods _COLON tokenvalue _NL'],
+    'token': ['TOKEN _COLON expansions _NL'],
+    'statement': ['_PERCENT RULE expansions _NL'],

-    '?tokenvalue': ['REGEXP', 'STRING'],
-    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
+    'tokenvalue': ['REGEXP', 'STRING'],
 }

@@ -218,29 +219,14 @@ class SimplifyTree(InlineTransformer):
         tokenmods, value = args
         return tokenmods + [value]

-def get_tokens(tree, token_set):
-    for t in tree.find_data('token'):
-        x = t.children
-        name = x[0].value
-        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
-        if name in token_set:
-            raise ValueError("Token '%s' defined more than once" % name)
-        token_set.add(name)
-
-        if len(x) == 2:
-            yield name, x[1], []
-        else:
-            assert len(x) == 3
-            yield name, x[2], x[1]
-
 class ExtractAnonTokens(InlineTransformer):
     def __init__(self, tokens, token_set):
         self.tokens = tokens
         self.token_set = token_set
-        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
+        self.token_reverse = {td.value: td.name for td in tokens}
         self.i = 0

-    def anontoken(self, token):
+    def tokenvalue(self, token):
         if token.type == 'STRING':
             value = token.value[1:-1]
             try:
@@ -270,13 +256,167 @@ class ExtractAnonTokens(InlineTransformer):

         if token_name not in self.token_set:
             self.token_set.add(token_name)
-            self.tokens.append((token_name, token, []))
+            if token.type == 'STRING':
+                self.tokens.append(TokenDef__Str(token_name, token[1:-1]))
+            else:
+                self.tokens.append(TokenDef__Regexp(token_name, token[1:-1]))
             assert value not in self.token_reverse, value
             self.token_reverse[value] = token_name

         return Token('TOKEN', token_name, -1)


+class TokenValue(object):
+    def __init__(self, value):
+        self.value = value
+
+class TokenValue__Str(TokenValue):
+    def to_regexp(self):
+        return re.escape(self.value)
+
+class TokenValue__Regexp(TokenValue):
+    def to_regexp(self):
+        return self.value
+
+class TokenTreeToRegexp(Transformer):
+    def tokenvalue(self, tv):
+        tv ,= tv
+        value = tv.value[1:-1]
+
+        if r'\u' in value:
+            # XXX for now, you can't mix unicode escaping and unicode characters at the same token
+            value = unicode_escape(value)[0]
+
+        if tv.type == 'REGEXP':
+            return TokenValue__Regexp(value)
+        elif tv.type == 'STRING':
+            return TokenValue__Str(value)
+
+        assert False
+
+    def expansion(self, items):
+        if len(items) == 1:
+            return items[0]
+        return TokenValue__Regexp(''.join(i.to_regexp() for i in items))
+    def expansions(self, exps):
+        if len(exps) == 1:
+            return exps[0]
+        return TokenValue__Regexp('%s' % ('|'.join(i.to_regexp() for i in exps)))
+    def range(self, items):
+        assert all(i.type=='STRING' for i in items)
+        items = [i[1:-1] for i in items]
+        start, end = items
+        assert len(start) == len(end) == 1, (start, end)
+        return TokenValue__Regexp('[%s-%s]' % (start, end))
+
+    def expr(self, args):
+        inner, op = args
+        return TokenValue__Regexp('(?:%s)%s' % (inner.to_regexp(), op))
+
+class Grammar:
+    def __init__(self, ruledefs, tokendefs, extra):
+        self.tokendefs = tokendefs
+        self.ruledefs = ruledefs
+        self.extra = extra
+
+    def compile(self, lexer=False):
+        assert lexer
+
+        tokendefs = [(name.value, t) for name, t in self.tokendefs]
+
+        ignore = []
+        for i, t in enumerate(self.extra['ignore']):
+            name = '__IGNORE_%d'%i
+            tokendefs.append((name, t))
+            ignore.append(name)
+        self.extra['ignore'] = ignore
+
+        # =================
+        #  Compile Tokens
+        # =================
+        token_to_regexp = TokenTreeToRegexp()
+        token_dict = dict(tokendefs)
+        assert len(token_dict) == len(tokendefs), "Same name defined twice?"
+
+        # Resolve token assignments
+        while True:
+            changed = False
+            for name, token_tree in tokendefs:
+                for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
+                    for i, item in enumerate(exp.children):
+                        if isinstance(item, Token):
+                            assert item.type != 'RULE', "Rules aren't allowed inside tokens"
+                            if item.type == 'TOKEN':
+                                exp.children[i] = token_dict[item]
+                                changed = True
+            if not changed:
+                break
+
+        # Convert tokens to strings/regexps
+        tokens = []
+        for name, token_tree in tokendefs:
+            regexp = token_to_regexp.transform(token_tree)
+            if isinstance(regexp, TokenValue__Str):
+                tokendef = TokenDef__Str(name, regexp.value)
+            else:
+                tokendef = TokenDef__Regexp(name, regexp.to_regexp())
+            tokens.append(tokendef)

+        # Resolve regexp assignments of the form /..${X}../
+        # Not sure this is even important, since you can express most regexps with EBNF
+        # TODO a nicer implementation of this
+        token_dict = {td.name: td.to_regexp() for td in tokens}
+        while True:
+            changed = False
+            for t in tokens:
+                if isinstance(t, TokenDef__Regexp):
+                    sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.value)
+                    if sp:
+                        value = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
+                                        for x in sp)
+                        if value != t.value:
+                            t.value = value
+                            changed = True
+            if not changed:
+                break
+
+        # =================
+        #  Compile Rules
+        # =================
+        extract_anon = ExtractAnonTokens(tokens, set(token_dict))
+        ebnf_to_bnf = EBNF_to_BNF()
+        simplify_rule = SimplifyRule_Visitor()
+        rule_tree_to_text = RuleTreeToText()
+        rules = {}
+
+        for name, rule_tree in self.ruledefs:
+            assert name not in rules
+            tree = extract_anon.transform(rule_tree) # Adds to tokens
+            rules[name] = ebnf_to_bnf.transform(tree)
+
+        dict_update_safe(rules, ebnf_to_bnf.new_rules)
+
+        for r in rules.values():
+            simplify_rule.visit(r)
+
+        rules = {origin: rule_tree_to_text.transform(tree) for origin, tree in rules.items()}
+
+        return tokens, rules, self.extra
+
+
+class GrammarRule:
+    def __init__(self, name, expansions):
+        self.keep_all_tokens = name.startswith('!')
+        name = name.lstrip('!')
+        self.expand1 = name.startswith('?')
+        name = name.lstrip('?')
+
+        self.name = name
+        self.expansions = expansions
+
+
 class GrammarLoader:
     def __init__(self):
@@ -289,8 +429,6 @@ class GrammarLoader:
         self.parser = LALR(lexer_conf, parser_conf)

         self.simplify_tree = SimplifyTree()
-        self.simplify_rule = SimplifyRule_Visitor()
-        self.rule_tree_to_text = RuleTreeToText()

     def load_grammar(self, grammar_text):
         try:
@@ -306,74 +444,51 @@ class GrammarLoader:
             raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
             raise

-        # =================
-        # Process Tokens
-        # =================
-
-        token_set = set()
-        tokens = list(get_tokens(tree, token_set))
-        extract_anon = ExtractAnonTokens(tokens, token_set)
-        tree = extract_anon.transform(tree) # Adds to tokens
-
-        token_ref = {}
-        tokendefs = []
-        for name, token, flags in tokens:
-            value = token.value[1:-1]
-            if r'\u' in value:
-                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
-                value = unicode_escape(value)[0]
-
-            if token.type == 'REGEXP':
-                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
-                if sp:
-                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
-                                    for x in sp)
-
-                token_ref[name] = value
-                tokendef = TokenDef__Regexp(name, value)
+        # Extract grammar items
+
+        token_defs = [c.children for c in tree.children if c.data=='token']
+        rule_defs = [c.children for c in tree.children if c.data=='rule']
+        statements = [c.children for c in tree.children if c.data=='statement']
+        assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)
+
+        # Verify correctness
+        token_names = set()
+        for name, _ in token_defs:
+            if name.startswith('__'):
+                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+            if name in token_names:
+                raise GrammarError("Token '%s' defined more than once" % name)
+            token_names.add(name)
+
+        rules = [GrammarRule(name, x) for name, x in rule_defs]
+
+        rule_names = set()
+        for r in rules:
+            if r.name.startswith('__'):
+                raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+            if r.name in rule_names:
+                raise GrammarError("Token '%s' defined more than once" % name)
+            rule_names.add(r.name)
+
+        for r in rules:
+            used_symbols = {t for x in r.expansions.find_data('expansion')
+                              for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
+            for sym in used_symbols:
+                if is_terminal(sym):
+                    if sym not in token_names:
+                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, r.name))
+                else:
+                    if sym not in rule_names:
+                        raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, r.name))
+
+        ignore = []
+        for command, expansions in statements:
+            if command == 'ignore':
+                ignore.append(expansions)
             else:
-                assert token.type == 'STRING'
-                tokendef = TokenDef__Str(name, value)
+                assert False, command

-            tokendefs.append((tokendef, flags))
-
-        # =================
-        # Process Rules
-        # =================
-
-        ebnf_to_bnf = EBNF_to_BNF()
-
-        rules = {}
-        for rule in tree.find_data('rule'):
-            name, ebnf_tree = rule.children
-            name = name.value
-            if name in rules:
-                raise ValueError("Rule '%s' defined more than once" % name)
-
-            rules[name] = ebnf_to_bnf.transform(ebnf_tree)
-
-        dict_update_safe(rules, ebnf_to_bnf.new_rules)
-
-        for r in rules.values():
-            self.simplify_rule.visit(r)
-
-        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}
-
-        # ====================
-        # Verify correctness
-        # ====================
-        used_symbols = {symbol for expansions in rules.values()
-                          for expansion, _alias in expansions
-                          for symbol in expansion}
-        rule_set = {r.lstrip('!').lstrip('?') for r in rules}
-        for sym in used_symbols:
-            if is_terminal(sym):
-                if sym not in token_set:
-                    raise GrammarError("Token '%s' used but not defined" % sym)
-            else:
-                if sym not in rule_set:
-                    raise GrammarError("Rule '%s' used but not defined" % sym)
+        return Grammar(rule_defs, token_defs, {'ignore': ignore})

-        return tokendefs, rules

 load_grammar = GrammarLoader().load_grammar
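
Note: the "Resolve token assignments" loop in Grammar.compile above is what makes token-in-token references work: a TOKEN name used inside another token's tree is replaced by that token's tree until nothing changes, and only then is the tree flattened to a regexp by TokenTreeToRegexp. An illustration of the intended result for the grammar in test_token_embed below (the dict and assert are mine, not the commit's):

    import re

    # A: "a"      ->  a
    # B: A "b"    ->  ab    (A's tree is inlined, then the strings concatenate)
    # C: B "c"    ->  abc
    compiled = {
        'A': re.escape('a'),
        'B': re.escape('a') + re.escape('b'),
        'C': re.escape('a') + re.escape('b') + re.escape('c'),
    }
    assert re.match(compiled['C'], 'abc')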

lark/parsers/lalr_parser.py  (+1, -0)

@@ -16,6 +16,7 @@ class Parser(object):

     def parse(self, seq, set_state=None):
         i = 0
+        token = None
         stream = iter(seq)
         states_idx = self.analysis.states_idx




lark/tree.py  (+10, -0)

@@ -48,6 +48,16 @@ class Tree(object):
     def find_data(self, data):
         return self.find_pred(lambda t: t.data == data)

+    def scan_values(self, pred):
+        for c in self.children:
+            if isinstance(c, Tree):
+                for t in c.scan_values(pred):
+                    yield t
+            else:
+                if pred(c):
+                    yield c
+
+
     def __deepcopy__(self, memo):
         return type(self)(self.data, deepcopy(self.children, memo))
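
Note: the new Tree.scan_values walks the leaves (non-Tree children, recursively) and yields those matching a predicate; load_grammar.py uses it above to collect every RULE and TOKEN name referenced by a rule. A tiny standalone illustration with plain strings as leaves (not from the commit):

    from lark.tree import Tree

    t = Tree('expansion', ['NAME', Tree('expr', ['NUMBER', '+']), '('])
    print(list(t.scan_values(lambda v: v.isupper())))   # ['NAME', 'NUMBER']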




tests/test_parser.py  (+22, -1)

@@ -269,7 +269,7 @@ def _make_parser_test(PARSER):
        def test_token_collision(self):
            g = _Lark("""start: "Hello" NAME
                        NAME: /\w+/
-                        WS.ignore: /\s+/
+                        %ignore " "
                        """)
            x = g.parse('Hello World')
            self.assertSequenceEqual(x.children, ['World'])
@@ -303,6 +303,14 @@ def _make_parser_test(PARSER):
                        """)
            x = g.parse('aababc')

+        def test_token_embed(self):
+            g = _Lark("""start: A B C
+                        A: "a"
+                        B: A "b"
+                        C: B "c"
+                        """)
+            x = g.parse('aababc')
+
        def test_token_not_anon(self):
            """Tests that "a" is matched as A, rather than an anonymous token.
@@ -334,6 +342,19 @@ def _make_parser_test(PARSER):
            x = g.parse('a')
            self.assertEqual(x.data, "b")

+        def test_token_ebnf(self):
+            g = _Lark("""start: A
+                      A: "a"* ("b"? "c".."e")+
+                      """)
+            x = g.parse('abcde')
+            x = g.parse('dd')
+
+        # def test_token_recurse(self):
+        #     g = _Lark("""start: A
+        #                  A: B
+        #                  B: A
+        #                  """)
+
        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}

