
Initial support for EBNF in tokens (automatic compilation to regexps)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
Parent
revision c70355389f
9 files changed, 263 additions and 117 deletions
  1. +5    -3    examples/calc.py
  2. +3    -2    examples/conf.py
  3. +1    -1    examples/json_parser.py
  4. +5    -10   lark/lark.py
  5. +4    -3    lark/lexer.py
  6. +212  -97   lark/load_grammar.py
  7. +1    -0    lark/parsers/lalr_parser.py
  8. +10   -0    lark/tree.py
  9. +22   -1    tests/test_parser.py

examples/calc.py  (+5 -3)

@@ -22,13 +22,15 @@ calc_grammar = """
| product "*" atom -> mul
| product "/" atom -> div

?atom: /[\d.]+/ -> number
?atom: NUMBER -> number
| "-" atom -> neg
| NAME -> var
| "(" sum ")"

NAME: /[a-zA-Z]\w*/
WS.ignore: /\s+/
NAME: /[a-zA-Z]\w+/ // Regexp form
NUMBER: ("0".."9"|".")+ // EBNF form (compiles to regexp)

%ignore " "|"\t"
"""

class CalculateTree(InlineTransformer):
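
Note: the calculator grammar now defines NUMBER in EBNF form and replaces the old WS.ignore token flag with a %ignore statement (which also accepts a plain regexp, as the conf.py and json_parser.py changes below show). A minimal sketch of the new syntax in isolation, assuming the same Lark constructor arguments as examples/calc.py; the expected result is stated as a comment, not verified output.

    from lark import Lark

    grammar = """
        start: NUMBER

        NUMBER: ("0".."9"|".")+    // EBNF form, compiled to a regexp

        %ignore " "
    """

    parser = Lark(grammar, parser="lalr")
    tree = parser.parse(" 3.14 ")
    # expected: tree.children holds a single NUMBER token, "3.14"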


examples/conf.py  (+3 -2)

@@ -22,9 +22,10 @@ parser = Lark(r"""
NAME: /[a-zA-Z_]\w*/
VALUE: /.*/

WS.ignore: /[\t \f]+/
COMMENT.ignore: /\#[^\n]*/
_NL: /(\r?\n)+/

%ignore /[\t \f]+/
%ignore /\#[^\n]*/
""", parser="lalr_contextual_lexer")




examples/json_parser.py  (+1 -1)

@@ -27,7 +27,7 @@ json_grammar = r"""
number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
string : /".*?(?<!\\)"/

WS.ignore: /[ \t\n]+/
%ignore /[ \t\n]+/
"""

class TreeToJson(Transformer):


lark/lark.py  (+5 -10)

@@ -112,16 +112,11 @@ class Lark:
assert not self.options.profile, "Feature temporarily disabled"
self.profiler = Profiler() if self.options.profile else None

tokens, self.rules = load_grammar(grammar)
self.ignore_tokens = []
for tokendef, flags in tokens:
for flag in flags:
if flag == 'ignore':
self.ignore_tokens.append(tokendef.name)
else:
raise GrammarError("No such flag: %s" % flag)

self.lexer_conf = LexerConf([t[0] for t in tokens], self.ignore_tokens, self.options.postlex)
self.grammar = load_grammar(grammar)
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=True)
self.ignore_tokens = self.grammar.extra['ignore']

self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)

if not self.options.only_lex:
self.parser = self._build_parser()
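
Note: Lark no longer interprets token flags itself; load_grammar() now returns a Grammar object, and the ignore list comes back from Grammar.compile(). A rough sketch of the new internal flow (internal API, likely to change), assuming the import path shown in this diff:

    from lark.load_grammar import load_grammar

    src = """
    start: "a"+
    %ignore " "
    """
    grammar = load_grammar(src)
    tokens, rules, extra = grammar.compile(lexer=True)
    # tokens: TokenDef objects (strings and regexps), rules: BNF rule dict,
    # extra['ignore']: generated token names such as __IGNORE_0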


lark/lexer.py  (+4 -3)

@@ -2,7 +2,7 @@

import re

from .utils import Str, classify
from .utils import Str, classify, STRING_TYPE
from .common import is_terminal

class LexError(Exception):
@@ -10,6 +10,7 @@ class LexError(Exception):

class TokenDef(object):
def __init__(self, name, value):
assert isinstance(value, STRING_TYPE), value
self.name = name
self.value = value

@@ -94,7 +95,7 @@ def _create_unless(tokens):

class Lexer(object):
def __init__(self, tokens, ignore=()):
assert all(isinstance(t, TokenDef) for t in tokens)
assert all(isinstance(t, TokenDef) for t in tokens), tokens

self.ignore = ignore
self.newline_char = '\n'
@@ -176,7 +177,7 @@ class ContextualLexer:
def __init__(self, tokens, states, ignore=(), always_accept=()):
tokens_by_name = {}
for t in tokens:
assert t.name not in tokens_by_name
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t

lexer_by_tokens = {}
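
Note: load_grammar.py now builds TokenDef__Str and TokenDef__Regexp objects and calls to_regexp() on them; those subclasses are not shown in this diff. A hypothetical sketch of what they might look like, assuming they mirror the TokenValue__Str / TokenValue__Regexp pattern further down in load_grammar.py:

    import re

    class TokenDef(object):
        def __init__(self, name, value):
            self.name = name
            self.value = value

    # Hypothetical: literal strings get escaped, regexps pass through unchanged.
    class TokenDef__Str(TokenDef):
        def to_regexp(self):
            return re.escape(self.value)

    class TokenDef__Regexp(TokenDef):
        def to_regexp(self):
            return self.value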


lark/load_grammar.py  (+212 -97)

@@ -1,3 +1,4 @@
from itertools import chain
import re
import codecs

@@ -61,20 +62,21 @@ TOKENS = {
'_COLON': ':',
'_OR': r'\|',
'_DOT': r'\.',
'_PERCENT': r'%',
'RULE': '!?[_?]?[a-z][_a-z0-9]*',
'TOKEN': '_?[A-Z][_A-Z0-9]*',
'STRING': r'".*?[^\\]"',
'REGEXP': r"/(?!/).*?[^\\]/",
'_NL': r'(\r?\n)+\s*',
'WS': r'[ \t]+',
'COMMENT': r'//[^\n]*\n',
'COMMENT': r'//[^\n]*',
'_TO': '->'
}

RULES = {
'start': ['list'],
'list': ['item', 'list item'],
'item': ['rule', 'token', '_NL'],
'start': ['_list'],
'_list': ['_item', '_list _item'],
'_item': ['rule', 'token', 'statement', '_NL'],

'rule': ['RULE _COLON expansions _NL'],
'expansions': ['alias',
@@ -93,17 +95,16 @@ RULES = {
'maybe',
'RULE',
'TOKEN',
'anontoken'],

'anontoken': ['tokenvalue'],
'tokenvalue',
'range'],

'maybe': ['_LBRA expansions _RBRA'],
'range': ['STRING _DOT _DOT STRING'],

'token': ['TOKEN _COLON tokenvalue _NL',
'TOKEN tokenmods _COLON tokenvalue _NL'],
'token': ['TOKEN _COLON expansions _NL'],
'statement': ['_PERCENT RULE expansions _NL'],

'?tokenvalue': ['REGEXP', 'STRING'],
'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
'tokenvalue': ['REGEXP', 'STRING'],
}


@@ -218,29 +219,14 @@ class SimplifyTree(InlineTransformer):
tokenmods, value = args
return tokenmods + [value]

def get_tokens(tree, token_set):
for t in tree.find_data('token'):
x = t.children
name = x[0].value
assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
if name in token_set:
raise ValueError("Token '%s' defined more than once" % name)
token_set.add(name)

if len(x) == 2:
yield name, x[1], []
else:
assert len(x) == 3
yield name, x[2], x[1]

class ExtractAnonTokens(InlineTransformer):
def __init__(self, tokens, token_set):
self.tokens = tokens
self.token_set = token_set
self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
self.token_reverse = {td.value: td.name for td in tokens}
self.i = 0

def anontoken(self, token):
def tokenvalue(self, token):
if token.type == 'STRING':
value = token.value[1:-1]
try:
@@ -270,13 +256,167 @@ class ExtractAnonTokens(InlineTransformer):

if token_name not in self.token_set:
self.token_set.add(token_name)
self.tokens.append((token_name, token, []))
if token.type == 'STRING':
self.tokens.append(TokenDef__Str(token_name, token[1:-1]))
else:
self.tokens.append(TokenDef__Regexp(token_name, token[1:-1]))
assert value not in self.token_reverse, value
self.token_reverse[value] = token_name

return Token('TOKEN', token_name, -1)


class TokenValue(object):
def __init__(self, value):
self.value = value

class TokenValue__Str(TokenValue):
def to_regexp(self):
return re.escape(self.value)

class TokenValue__Regexp(TokenValue):
def to_regexp(self):
return self.value

class TokenTreeToRegexp(Transformer):
def tokenvalue(self, tv):
tv ,= tv
value = tv.value[1:-1]

if r'\u' in value:
# XXX for now, you can't mix unicode escaping and unicode characters at the same token
value = unicode_escape(value)[0]

if tv.type == 'REGEXP':
return TokenValue__Regexp(value)
elif tv.type == 'STRING':
return TokenValue__Str(value)

assert False

def expansion(self, items):
if len(items) == 1:
return items[0]
return TokenValue__Regexp(''.join(i.to_regexp() for i in items))
def expansions(self, exps):
if len(exps) == 1:
return exps[0]
return TokenValue__Regexp('%s' % ('|'.join(i.to_regexp() for i in exps)))
def range(self, items):
assert all(i.type=='STRING' for i in items)
items = [i[1:-1] for i in items]
start, end = items
assert len(start) == len(end) == 1, (start, end)
return TokenValue__Regexp('[%s-%s]' % (start, end))

def expr(self, args):
inner, op = args
return TokenValue__Regexp('(?:%s)%s' % (inner.to_regexp(), op))

class Grammar:
def __init__(self, ruledefs, tokendefs, extra):
self.tokendefs = tokendefs
self.ruledefs = ruledefs
self.extra = extra

def compile(self, lexer=False):
assert lexer

tokendefs = [(name.value, t) for name, t in self.tokendefs]

ignore = []
for i, t in enumerate(self.extra['ignore']):
name = '__IGNORE_%d'%i
tokendefs.append((name, t))
ignore.append(name)
self.extra['ignore'] = ignore

# =================
# Compile Tokens
# =================
token_to_regexp = TokenTreeToRegexp()
token_dict = dict(tokendefs)
assert len(token_dict) == len(tokendefs), "Same name defined twice?"

# Resolve token assignments
while True:
changed = False
for name, token_tree in tokendefs:
for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
for i, item in enumerate(exp.children):
if isinstance(item, Token):
assert item.type != 'RULE', "Rules aren't allowed inside tokens"
if item.type == 'TOKEN':
exp.children[i] = token_dict[item]
changed = True
if not changed:
break

# Convert tokens to strings/regexps
tokens = []
for name, token_tree in tokendefs:
regexp = token_to_regexp.transform(token_tree)
if isinstance(regexp, TokenValue__Str):
tokendef = TokenDef__Str(name, regexp.value)
else:
tokendef = TokenDef__Regexp(name, regexp.to_regexp())
tokens.append(tokendef)

# Resolve regexp assignments of the form /..${X}../
# Not sure this is even important, since you can express most regexps with EBNF
# TODO a nicer implementation of this
token_dict = {td.name: td.to_regexp() for td in tokens}
while True:
changed = False
for t in tokens:
if isinstance(t, TokenDef__Regexp):
sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], t.value)
if sp:
value = ''.join(token_dict[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
for x in sp)
if value != t.value:
t.value = value
changed = True
if not changed:
break

# =================
# Compile Rules
# =================
extract_anon = ExtractAnonTokens(tokens, set(token_dict))
ebnf_to_bnf = EBNF_to_BNF()
simplify_rule = SimplifyRule_Visitor()
rule_tree_to_text = RuleTreeToText()
rules = {}

for name, rule_tree in self.ruledefs:
assert name not in rules
tree = extract_anon.transform(rule_tree) # Adds to tokens
rules[name] = ebnf_to_bnf.transform(tree)

dict_update_safe(rules, ebnf_to_bnf.new_rules)

for r in rules.values():
simplify_rule.visit(r)

rules = {origin: rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

return tokens, rules, self.extra



class GrammarRule:
def __init__(self, name, expansions):
self.keep_all_tokens = name.startswith('!')
name = name.lstrip('!')
self.expand1 = name.startswith('?')
name = name.lstrip('?')

self.name = name
self.expansions = expansions




class GrammarLoader:
def __init__(self):
@@ -289,8 +429,6 @@ class GrammarLoader:
self.parser = LALR(lexer_conf, parser_conf)

self.simplify_tree = SimplifyTree()
self.simplify_rule = SimplifyRule_Visitor()
self.rule_tree_to_text = RuleTreeToText()

def load_grammar(self, grammar_text):
try:
@@ -306,74 +444,51 @@ class GrammarLoader:
raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
raise

# =================
# Process Tokens
# =================

token_set = set()
tokens = list(get_tokens(tree, token_set))
extract_anon = ExtractAnonTokens(tokens, token_set)
tree = extract_anon.transform(tree) # Adds to tokens

token_ref = {}
tokendefs = []
for name, token, flags in tokens:
value = token.value[1:-1]
if r'\u' in value:
# XXX for now, you can't mix unicode escaping and unicode characters at the same token
value = unicode_escape(value)[0]

if token.type == 'REGEXP':
sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
if sp:
value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
for x in sp)

token_ref[name] = value
tokendef = TokenDef__Regexp(name, value)
# Extract grammar items

token_defs = [c.children for c in tree.children if c.data=='token']
rule_defs = [c.children for c in tree.children if c.data=='rule']
statements = [c.children for c in tree.children if c.data=='statement']
assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)

# Verify correctness
token_names = set()
for name, _ in token_defs:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if name in token_names:
raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name)

rules = [GrammarRule(name, x) for name, x in rule_defs]

rule_names = set()
for r in rules:
if r.name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if r.name in rule_names:
raise GrammarError("Token '%s' defined more than once" % name)
rule_names.add(r.name)

for r in rules:
used_symbols = {t for x in r.expansions.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
for sym in used_symbols:
if is_terminal(sym):
if sym not in token_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, r.name))
else:
if sym not in rule_names:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, r.name))

ignore = []
for command, expansions in statements:
if command == 'ignore':
ignore.append(expansions)
else:
assert token.type == 'STRING'
tokendef = TokenDef__Str(name, value)
assert False, command

tokendefs.append((tokendef, flags))

# =================
# Process Rules
# =================

ebnf_to_bnf = EBNF_to_BNF()

rules = {}
for rule in tree.find_data('rule'):
name, ebnf_tree = rule.children
name = name.value
if name in rules:
raise ValueError("Rule '%s' defined more than once" % name)

rules[name] = ebnf_to_bnf.transform(ebnf_tree)

dict_update_safe(rules, ebnf_to_bnf.new_rules)

for r in rules.values():
self.simplify_rule.visit(r)

rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

# ====================
# Verify correctness
# ====================
used_symbols = {symbol for expansions in rules.values()
for expansion, _alias in expansions
for symbol in expansion}
rule_set = {r.lstrip('!').lstrip('?') for r in rules}
for sym in used_symbols:
if is_terminal(sym):
if sym not in token_set:
raise GrammarError("Token '%s' used but not defined" % sym)
else:
if sym not in rule_set:
raise GrammarError("Rule '%s' used but not defined" % sym)
return Grammar(rule_defs, token_defs, {'ignore': ignore})

return tokendefs, rules

load_grammar = GrammarLoader().load_grammar
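
Note: the core of this commit is TokenTreeToRegexp, which lowers EBNF token definitions to plain regexps (a range becomes a character class, alternation becomes |, repetition wraps the operand in a (?:...) group). A by-hand trace of those rules for NUMBER: ("0".."9"|".")+ from examples/calc.py, not a call into the library:

    import re

    digit  = '[%s-%s]' % ('0', '9')        # range "0".."9"  -> [0-9]
    dot    = re.escape('.')                # STRING "."      -> \.
    alts   = '|'.join([digit, dot])        # expansions      -> [0-9]|\.
    number = '(?:%s)%s' % (alts, '+')      # expr with "+"   -> (?:[0-9]|\.)+

    assert re.fullmatch(number, '3.14')
    assert not re.fullmatch(number, '3a')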

lark/parsers/lalr_parser.py  (+1 -0)

@@ -16,6 +16,7 @@ class Parser(object):

def parse(self, seq, set_state=None):
i = 0
token = None
stream = iter(seq)
states_idx = self.analysis.states_idx



lark/tree.py  (+10 -0)

@@ -48,6 +48,16 @@ class Tree(object):
def find_data(self, data):
return self.find_pred(lambda t: t.data == data)

def scan_values(self, pred):
for c in self.children:
if isinstance(c, Tree):
for t in c.scan_values(pred):
yield t
else:
if pred(c):
yield c


def __deepcopy__(self, memo):
return type(self)(self.data, deepcopy(self.children, memo))
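
Note: scan_values is the new helper that load_grammar.py uses to collect every RULE/TOKEN leaf in a rule tree. A small usage sketch; FakeToken is a stand-in leaf type so the snippet is self-contained (in the library the leaves are lark Token instances):

    from collections import namedtuple
    from lark.tree import Tree

    FakeToken = namedtuple('FakeToken', 'type value')

    t = Tree('expansion', [FakeToken('RULE', 'sum'),
                           Tree('expr', [FakeToken('TOKEN', 'NUMBER')])])

    used = [tok.value for tok in t.scan_values(lambda tok: tok.type in ('RULE', 'TOKEN'))]
    assert used == ['sum', 'NUMBER']   # mirrors the symbol scan in load_grammar.py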



tests/test_parser.py  (+22 -1)

@@ -269,7 +269,7 @@ def _make_parser_test(PARSER):
def test_token_collision(self):
g = _Lark("""start: "Hello" NAME
NAME: /\w+/
WS.ignore: /\s+/
%ignore " "
""")
x = g.parse('Hello World')
self.assertSequenceEqual(x.children, ['World'])
@@ -303,6 +303,14 @@ def _make_parser_test(PARSER):
""")
x = g.parse('aababc')

def test_token_embed(self):
g = _Lark("""start: A B C
A: "a"
B: A "b"
C: B "c"
""")
x = g.parse('aababc')

def test_token_not_anon(self):
"""Tests that "a" is matched as A, rather than an anonymous token.

@@ -334,6 +342,19 @@ def _make_parser_test(PARSER):
x = g.parse('a')
self.assertEqual(x.data, "b")

def test_token_ebnf(self):
g = _Lark("""start: A
A: "a"* ("b"? "c".."e")+
""")
x = g.parse('abcde')
x = g.parse('dd')

# def test_token_recurse(self):
# g = _Lark("""start: A
# A: B
# B: A
# """)

def test_lexer_token_limit(self):
"Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
tokens = {'A%d'%i:'"%d"'%i for i in range(300)}

