
Scanless Earley now working for all tests!

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
Commit a60c339ff7
10 changed files with 216 additions and 140 deletions
  1. examples/json_parser.py (+6, -2)
  2. lark/grammars/common.g (+2, -1)
  3. lark/lark.py (+2, -2)
  4. lark/lexer.py (+5, -7)
  5. lark/load_grammar.py (+117, -85)
  6. lark/parse_tree_builder.py (+21, -25)
  7. lark/parser_frontends.py (+36, -7)
  8. lark/parsers/earley.py (+4, -4)
  9. tests/__main__.py (+1, -1)
  10. tests/test_parser.py (+22, -6)

examples/json_parser.py (+6, -2)

@@ -36,7 +36,7 @@ json_grammar = r"""
class TreeToJson(Transformer):
@inline_args
def string(self, s):
return s[1:-1]
return s[1:-1].replace('\\"', '"')

array = list
pair = tuple
@@ -47,6 +47,10 @@ class TreeToJson(Transformer):
true = lambda self, _: True
false = lambda self, _: False

# json_parser = Lark(json_grammar, parser='earley')
# def parse(x):
# return TreeToJson().transform(json_parser.parse(x))

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse = json_parser.parse

@@ -57,7 +61,7 @@ def test():
"empty_array" : [],
"booleans" : { "YES" : true, "NO" : false },
"numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
"strings" : [ "This", [ "And" , "That" ] ],
"strings" : [ "This", [ "And" , "That", "And a \\"b" ] ],
"nothing" : null
}
'''

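The transformer change above strips the surrounding quotes and then unescapes `\"`, and the test data gains an escaped quote to exercise it. A minimal standalone sketch of that handling (the `json_string` helper is illustrative, not part of the example file):

```python
# Minimal sketch of the updated string handling: drop the surrounding
# quotes, then turn the two-character escape \" back into a literal quote.
def json_string(raw):
    return raw[1:-1].replace('\\"', '"')

assert json_string(r'"And a \"b"') == 'And a "b'
```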

lark/grammars/common.g (+2, -1)

@@ -19,7 +19,8 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
//
// Strings
//
ESCAPED_STRING: /".*?(?<!\\\\)"/
STRING_INNER: ("\\\""|/[^"]/)
ESCAPED_STRING: "\"" STRING_INNER* "\""


//

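The single-regex ESCAPED_STRING with a negative lookbehind is replaced by a compositional definition built from an auxiliary STRING_INNER token, which keeps every piece narrow enough for the scanless parser. A rough plain-regex equivalent, shown only to illustrate what the new definition accepts (the names mirror the grammar, but this is not lark code):

```python
import re

# Plain-regex rendering of: ESCAPED_STRING: "\"" STRING_INNER* "\""
# where STRING_INNER matches either an escaped quote or any non-quote char.
STRING_INNER = r'(?:\\"|[^"])'
ESCAPED_STRING = re.compile(r'"' + STRING_INNER + r'*"')

assert ESCAPED_STRING.fullmatch(r'"plain"')
assert ESCAPED_STRING.fullmatch(r'"with \" inside"')
assert not ESCAPED_STRING.fullmatch(r'"unterminated')
```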

lark/lark.py (+2, -2)

@@ -127,8 +127,8 @@ class Lark:
lexer = 'standard'
self.options.lexer = lexer

self.grammar = load_grammar(grammar)
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer))
self.grammar = load_grammar(grammar, source)
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer), start=self.options.start)
self.ignore_tokens = self.grammar.extra['ignore']

self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)


lark/lexer.py (+5, -7)

@@ -20,11 +20,13 @@ class UnexpectedInput(LexError):
self.context = context

class Token(Str):
def __new__(cls, type_, value, pos_in_stream=None):
def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
inst = Str.__new__(cls, value)
inst.type = type_
inst.pos_in_stream = pos_in_stream
inst.value = value
inst.line = line
inst.column = column
return inst

@classmethod
@@ -134,9 +136,7 @@ class Lexer(object):
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, lex_pos)
t.line = line
t.column = lex_pos - col_start_pos
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
if t.type in self.callback:
t = self.callback[t.type](t)
yield t
@@ -198,9 +198,7 @@ class ContextualLexer:
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, lex_pos)
t.line = line
t.column = lex_pos - col_start_pos
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
yield t

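Both lexers now pass line and column straight into the Token constructor instead of patching the attributes on after construction. A self-contained sketch of the pattern, using only the standard library (MiniToken stands in for lark's Token, which subclasses its own Str wrapper):

```python
# A str subclass that carries token metadata, so position information can
# be attached at construction time rather than assigned afterwards.
class MiniToken(str):
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = super().__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.line = line
        inst.column = column
        return inst

t = MiniToken('NAME', 'hello', pos_in_stream=12, line=3, column=4)
assert t == 'hello' and t.line == 3 and t.column == 4
```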

lark/load_grammar.py (+117, -85)

@@ -1,7 +1,6 @@
import os.path
from itertools import chain
import re
import codecs
from ast import literal_eval

from .lexer import Lexer, Token, UnexpectedInput
@@ -13,12 +12,9 @@ from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr

from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

__path__ = os.path.dirname(__file__)
IMPORT_PATHS = [os.path.join(__path__, 'grammars')]


_TOKEN_NAMES = {
'.' : 'DOT',
',' : 'COMMA',
@@ -64,7 +60,7 @@ TOKENS = {
'_RPAR': r'\)',
'_LBRA': r'\[',
'_RBRA': r'\]',
'OP': '[+*?](?![a-z])',
'OP': '[+*][?]?|[?](?![a-z])',
'_COLON': ':',
'_OR': r'\|',
'_DOT': r'\.',
@@ -101,7 +97,7 @@ RULES = {
'?atom': ['_LPAR expansions _RPAR',
'maybe',
'name',
'tokenvalue',
'literal',
'range'],

'?name': ['RULE', 'TOKEN'],
@@ -117,7 +113,7 @@ RULES = {
'import_args': ['_import_args'],
'_import_args': ['name', '_import_args _DOT name'],

'tokenvalue': ['REGEXP', 'STRING'],
'literal': ['REGEXP', 'STRING'],
}


@@ -127,6 +123,7 @@ class EBNF_to_BNF(InlineTransformer):
self.rules_by_expr = {}
self.prefix = 'anon'
self.i = 0
self.rule_options = None

def _add_recurse_rule(self, type_, expr):
if expr in self.rules_by_expr:
@@ -135,7 +132,7 @@ class EBNF_to_BNF(InlineTransformer):
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
self.i += 1
t = Token('RULE', new_name, -1)
self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options
self.rules_by_expr[expr] = t
return t

@@ -240,18 +237,10 @@ class ExtractAnonTokens(InlineTransformer):
self.re_reverse = {td.pattern.value: td.name for td in tokens if isinstance(td.pattern, PatternRE)}
self.i = 0

def range(self, start, end):
assert start.type == end.type == 'STRING'
start = start.value[1:-1]
end = end.value[1:-1]
assert len(start) == len(end) == 1
regexp = '/[%s-%s]/' % (start, end)
t = Token('REGEXP', regexp)
return self.tokenvalue(t)

def tokenvalue(self, token):
value = token.value[1:-1]
if token.type == 'STRING':
def pattern(self, p):
value = p.value
if isinstance(p, PatternStr):
try:
# If already defined, use the user-defined token name
token_name = self.str_reverse[value]
@@ -261,49 +250,72 @@ class ExtractAnonTokens(InlineTransformer):
token_name = _TOKEN_NAMES[value]
except KeyError:
if value.isalnum() and value[0].isalpha() and ('__'+value.upper()) not in self.token_set:
token_name = value.upper() # This can create name duplications for unidentical tokens
token_name = '%s%d' % (value.upper(), self.i)
try:
# Make sure we don't have unicode in our token names
token_name.encode('ascii')
except UnicodeEncodeError:
token_name = 'ANONSTR_%d' % self.i
else:
token_name = 'ANONSTR_%d' % self.i
self.i += 1
self.i += 1

token_name = '__' + token_name

elif token.type == 'REGEXP':
elif isinstance(p, PatternRE):
if value in self.re_reverse: # Kind of a wierd placement
token_name = self.re_reverse[value]
else:
token_name = 'ANONRE_%d' % self.i
self.i += 1
else:
assert False, token
assert False, p

if token_name not in self.token_set:
self.token_set.add(token_name)

if token.type == 'STRING':
if isinstance(p, PatternStr):
assert value not in self.str_reverse
self.str_reverse[value] = token_name
else:
assert value not in self.re_reverse
self.re_reverse[value] = token_name

pattern = _tokenvalue_to_pattern(token)
self.tokens.append(TokenDef(token_name, pattern))
self.tokens.append(TokenDef(token_name, p))

return Token('TOKEN', token_name, -1)


def _tokenvalue_to_pattern(tokenvalue):
v = tokenvalue.value
def _literal_to_pattern(literal):
v = literal.value
assert v[0] == v[-1] and v[0] in '"/'
s = literal_eval("u'''%s'''" % v[1:-1])
return { 'STRING': PatternStr,
'REGEXP': PatternRE }[tokenvalue.type](s)
'REGEXP': PatternRE }[literal.type](s)


class PrepareLiterals(InlineTransformer):
def literal(self, literal):
return T('pattern', [_literal_to_pattern(literal)])

def range(self, start, end):
assert start.type == end.type == 'STRING'
start = start.value[1:-1]
end = end.value[1:-1]
assert len(start) == len(end) == 1
regexp = '[%s-%s]' % (start, end)
return T('pattern', [PatternRE(regexp)])

class SplitLiterals(InlineTransformer):
def pattern(self, p):
if isinstance(p, PatternStr) and len(p.value)>1:
return T('expansion', [T('pattern', [PatternStr(ch)]) for ch in p.value])
return T('pattern', [p])

class TokenTreeToPattern(Transformer):
def tokenvalue(self, tv):
tv ,= tv
return _tokenvalue_to_pattern(tv)
def pattern(self, ps):
p ,= ps
return p

def expansion(self, items):
if len(items) == 1:
@@ -313,38 +325,51 @@ class TokenTreeToPattern(Transformer):
if len(exps) == 1:
return exps[0]
return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
def range(self, items):
assert all(i.type=='STRING' for i in items)
items = [i[1:-1] for i in items]
start, end = items
assert len(start) == len(end) == 1, (start, end)
return PatternRE('[%s-%s]' % (start, end))

def expr(self, args):
inner, op = args
return PatternRE('(?:%s)%s' % (inner.to_regexp(), op))


def interleave(l, item):
for e in l:
yield e
if isinstance(e, T):
if e.data == 'literal':
yield item
elif is_terminal(e):
yield item

class Grammar:
def __init__(self, rule_defs, token_defs, extra):
self.token_defs = token_defs
self.rule_defs = rule_defs
self.extra = extra

def compile(self, lexer=False):
# assert lexer
def compile(self, lexer=False, start=None):
if not lexer:
self.rule_defs += self.token_defs
self.token_defs = []

for name, tree in self.rule_defs:
for tokenvalue in tree.find_data('tokenvalue'):
value ,= tokenvalue.children
if value.type == 'STRING':
assert value[0] == value[-1] == '"'
if len(value)>3:
tokenvalue.data = 'expansion'
tokenvalue.children = [T('tokenvalue', [Token('STRING', '"%s"'%ch)]) for ch in value[1:-1]]
tokendefs = list(self.token_defs)
# XXX VERY HACKY!! There must be a better way..
ignore_tokens = [('_'+name, t) for name, t in self.token_defs if name in self.extra['ignore']]
if ignore_tokens:
self.token_defs = [('_'+name if name in self.extra['ignore'] else name,t) for name,t in self.token_defs]
ignore_names = [t[0] for t in ignore_tokens]
expr = Token('RULE', '__ignore')
for r, tree, _o in self.rule_defs:
for exp in tree.find_data('expansion'):
exp.children = list(interleave(exp.children, expr))
if r == start: # TODO use GrammarRule or similar (RuleOptions?)
exp.children = [expr] + exp.children

x = [T('expansion', [Token('RULE', x)]) for x in ignore_names]
_ignore_tree = T('expr', [T('expansions', x), Token('OP', '?')])
self.rule_defs.append(('__ignore', _ignore_tree, None))

for name, tree in self.token_defs:
self.rule_defs.append((name, tree, RuleOptions(keep_all_tokens=True)))

token_defs = []
else:
token_defs = list(self.token_defs)

# =================
# Compile Tokens
@@ -353,7 +378,8 @@ class Grammar:

# Convert tokens to strings/regexps
tokens = []
for name, token_tree in tokendefs:
for name, token_tree in token_defs:
token_tree = PrepareLiterals().transform(token_tree)
pattern = token_tree_to_pattern.transform(token_tree)
tokens.append(TokenDef(name, pattern) )

@@ -384,31 +410,38 @@ class Grammar:
rule_tree_to_text = RuleTreeToText()
rules = {}

for name, rule_tree in self.rule_defs:
assert name not in rules
for name, rule_tree, options in self.rule_defs:
assert name not in rules, name
rule_tree = PrepareLiterals().transform(rule_tree)
if not lexer:
rule_tree = SplitLiterals().transform(rule_tree)
tree = extract_anon.transform(rule_tree) # Adds to tokens
rules[name] = ebnf_to_bnf.transform(tree)
ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
rules[name] = ebnf_to_bnf.transform(tree), options

dict_update_safe(rules, ebnf_to_bnf.new_rules)

for r in rules.values():
simplify_rule.visit(r)
for tree, _o in rules.values():
simplify_rule.visit(tree)

rules = {origin: rule_tree_to_text.transform(tree) for origin, tree in rules.items()}
rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()}

return tokens, rules, self.extra



class GrammarRule:
def __init__(self, name, expansions):
self.keep_all_tokens = name.startswith('!')
name = name.lstrip('!')
self.expand1 = name.startswith('?')
name = name.lstrip('?')
class RuleOptions:
def __init__(self, keep_all_tokens=False, expand1=False):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1

self.name = name
self.expansions = expansions
def _extract_options_for_rule(name, expansions):
keep_all_tokens = name.startswith('!')
name = name.lstrip('!')
expand1 = name.startswith('?')
name = name.lstrip('?')

return name, expansions, RuleOptions(keep_all_tokens, expand1)


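GrammarRule gives way to a plain RuleOptions record plus a helper that strips the `!` (keep all tokens) and `?` (expand single-child trees) prefixes from rule names once, up front. A standalone re-implementation of that prefix handling, for illustration only:

```python
# Illustrative re-implementation of the prefix handling done by
# _extract_options_for_rule: '!' keeps all tokens, '?' marks expand1.
def extract_options(name):
    keep_all_tokens = name.startswith('!')
    name = name.lstrip('!')
    expand1 = name.startswith('?')
    name = name.lstrip('?')
    return name, keep_all_tokens, expand1

assert extract_options('!?atom') == ('atom', True, True)
assert extract_options('pair') == ('pair', False, False)
```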

@@ -418,7 +451,7 @@ def import_grammar(grammar_path):
for import_path in IMPORT_PATHS:
with open(os.path.join(import_path, grammar_path)) as f:
text = f.read()
grammar = load_grammar(text)
grammar = load_grammar(text, grammar_path)
_imported_grammars[grammar_path] = grammar

return _imported_grammars[grammar_path]
@@ -447,7 +480,8 @@ class GrammarLoader:
def __init__(self):
tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()]

d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
rules = [_extract_options_for_rule(name, x) for name, x in RULES.items()]
d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'], None)
parser_conf = ParserConf(rules, callback, 'start')
@@ -455,17 +489,15 @@ class GrammarLoader:

self.simplify_tree = SimplifyTree()

def load_grammar(self, grammar_text):
# for x in self.parser.lex(grammar_text):
# print (x)
def load_grammar(self, grammar_text, name='<?>'):
try:
tree = self.simplify_tree.transform( self.parser.parse(grammar_text+'\n') )
except UnexpectedInput as e:
raise GrammarError("Unexpected input %r at line %d column %d" % (e.context, e.line, e.column))
raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
except UnexpectedToken as e:
if '_COLON' in e.expected:
raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
elif 'tokenvalue' in e.expected:
elif 'literal' in e.expected:
raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
elif e.expected == ['_OR']:
raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
@@ -528,30 +560,30 @@ class GrammarLoader:
raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name)

rules = [GrammarRule(name, x) for name, x in rule_defs]
rules = [_extract_options_for_rule(name, x) for name, x in rule_defs]

rule_names = set()
for r in rules:
if r.name.startswith('__'):
for name, _x, _o in rules:
if name.startswith('__'):
raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
if r.name in rule_names:
raise GrammarError("Rule '%s' defined more than once" % r.name)
rule_names.add(r.name)
if name in rule_names:
raise GrammarError("Rule '%s' defined more than once" % name)
rule_names.add(name)

for r in rules:
used_symbols = {t for x in r.expansions.find_data('expansion')
for name, expansions, _o in rules:
used_symbols = {t for x in expansions.find_data('expansion')
for t in x.scan_values(lambda t: t.type in ('RULE', 'TOKEN'))}
for sym in used_symbols:
if is_terminal(sym):
if sym not in token_names:
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, r.name))
raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
else:
if sym not in rule_names:
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, r.name))
raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))

# TODO don't include unused tokens, they can only cause trouble!

return Grammar(rule_defs, token_defs, {'ignore': ignore_names})
return Grammar(rules, token_defs, {'ignore': ignore_names})

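In the scanless branch, Grammar.compile folds the token definitions into the rules, splits multi-character string literals into sequences of one-character patterns (SplitLiterals), and interleaves an optional `__ignore` rule between symbols, which the diff itself flags as hacky. A conceptual sketch of the literal-splitting step, with plain tuples standing in for lark's Tree and PatternStr objects:

```python
# Conceptual version of SplitLiterals: a multi-character string pattern
# becomes an expansion of single-character patterns, so the scanless
# Earley parser can consume the input one character at a time.
def split_literal(value):
    if len(value) > 1:
        return ('expansion', [('pattern', ch) for ch in value])
    return ('pattern', value)

assert split_literal('if') == ('expansion', [('pattern', 'i'), ('pattern', 'f')])
assert split_literal('+') == ('pattern', '+')
```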




lark/parse_tree_builder.py (+21, -25)

@@ -13,21 +13,22 @@ def create_expand1_tree_builder_function(tree_builder):
return expand1

def create_rule_handler(expansion, usermethod, keep_all_tokens):
if not keep_all_tokens:
to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
if not (is_terminal(sym) and sym.startswith('_'))]

if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
def _build_ast(match):
children = []
for i, to_expand in to_include:
if to_expand:
children += match[i].children
else:
children.append(match[i])

return usermethod(children)
return _build_ast
# if not keep_all_tokens:
to_include = [(i, not is_terminal(sym) and sym.startswith('_'))
for i, sym in enumerate(expansion)
if keep_all_tokens or not is_terminal(sym) or not sym.startswith('_')]

if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
def _build_ast(match):
children = []
for i, to_expand in to_include:
if to_expand:
children += match[i].children
else:
children.append(match[i])

return usermethod(children)
return _build_ast

# else, if no filtering required..
return usermethod
@@ -48,21 +49,16 @@ class ParseTreeBuilder:
def create_tree_builder(self, rules, transformer):
callback = Callback()
new_rules = []
for origin, expansions in rules.items():
keep_all_tokens = False
if origin.startswith('!'):
origin=origin.lstrip('!')
keep_all_tokens = True
for origin, (expansions, options) in rules.items():
keep_all_tokens = options.keep_all_tokens if options else False
expand1 = options.expand1 if options else False

expand1 = origin.startswith('?')
_origin = origin.lstrip('?')
_origin = origin

for expansion, alias in expansions:
if alias and origin.startswith('_'):
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))

if alias:
alias = alias.lstrip('*')
_alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

try:

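create_rule_handler now folds the keep_all_tokens case into one comprehension: a symbol is dropped only when it is a terminal starting with `_` and keep_all_tokens is off, while a rule starting with `_` is kept but flagged so its children get inlined. A hedged standalone sketch of that decision (is_terminal is approximated here by an all-uppercase check, mirroring lark's naming convention rather than calling the library's helper):

```python
# Which children to keep, and which of the kept ones to expand in place.
def plan_children(expansion, keep_all_tokens=False):
    def is_terminal(sym):
        return sym.isupper()          # approximation of lark's convention
    return [(i, not is_terminal(sym) and sym.startswith('_'))
            for i, sym in enumerate(expansion)
            if keep_all_tokens or not is_terminal(sym) or not sym.startswith('_')]

# '_COMMA' is filtered out, '_list' (a rule) is kept but marked for inlining
assert plan_children(['value', '_COMMA', '_list']) == [(0, False), (2, True)]
assert plan_children(['value', '_COMMA'], keep_all_tokens=True) == [(0, False), (1, False)]
```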

lark/parser_frontends.py (+36, -7)

@@ -92,21 +92,39 @@ class Earley(WithLexer):
return res[0]


def tokenize_text(text):
new_text = []
line = 1
col_start_pos = 0
for i, ch in enumerate(text):
if '\n' in ch:
line += ch.count('\n')
col_start_pos = i + ch.rindex('\n')
new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
return new_text

class Nearley_NoLex:
def __init__(self, lexer_conf, parser_conf):
self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
rules = []
for name, exp, alias in parser_conf.rules:
name = self.tokens_to_convert.get(name, name)
exp = [self.tokens_to_convert.get(x, x) for x in exp]
rules.append((name, exp, alias))

self.token_by_name = {t.name:t for t in lexer_conf.tokens}

rules = [{'name':n,
'symbols': list(self._prepare_expansion(x)),
'postprocess': getattr(parser_conf.callback, a)}
for n,x,a in parser_conf.rules]
for n,x,a in rules]

self.parser = nearley.Parser(rules, parser_conf.start)

def _prepare_expansion(self, expansion):
for sym in expansion:
if is_terminal(sym):
regexp = self.token_by_name[sym].to_regexp()
regexp = self.token_by_name[sym].pattern.to_regexp()
width = sre_parse.parse(regexp).getwidth()
if not width == (1,1):
raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
@@ -115,9 +133,19 @@ class Nearley_NoLex:
yield sym

def parse(self, text):
res = self.parser.parse(text)
new_text = tokenize_text(text)
res = self.parser.parse(new_text)
assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
return res[0]
res = res[0]

class RestoreTokens(Transformer):
pass

for t in self.tokens_to_convert:
setattr(RestoreTokens, t, ''.join)

res = RestoreTokens().transform(res)
return res


class Earley_NoLex:
@@ -141,13 +169,14 @@ class Earley_NoLex:
regexp = self.token_by_name[sym].pattern.to_regexp()
width = sre_parse.parse(regexp).getwidth()
if not width == (1,1):
raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
yield (re.compile(regexp).match,)
raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
yield (re.compile(regexp).match, regexp)
else:
yield sym

def parse(self, text):
res = self.parser.parse(text)
new_text = tokenize_text(text)
res = self.parser.parse(new_text)
assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
res = res[0]


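tokenize_text is what lets the scanless frontends keep positions: the raw text becomes a stream of one-character CHAR tokens that carry line and column. A simplified, hedged sketch of the same bookkeeping, using plain tuples instead of Token objects and straightforward newline handling:

```python
# Per-character tokenization with line/column tracking (simplified).
def tokenize_chars(text):
    tokens = []
    line, line_start = 1, 0
    for i, ch in enumerate(text):
        tokens.append((ch, line, i - line_start))
        if ch == '\n':
            line, line_start = line + 1, i + 1
    return tokens

assert tokenize_chars('ab\nc') == [('a', 1, 0), ('b', 1, 1), ('\n', 1, 2), ('c', 2, 0)]
```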

lark/parsers/earley.py (+4, -4)

@@ -76,18 +76,18 @@ class Column:
for item in items:

if item.is_complete:
if item in added:
continue
# if item in added: # XXX This causes a bug with empty rules
# continue # And might be unnecessary
# added.add(item)
self.to_reduce.append(item)
added.add(item)
else:
if is_terminal(item.expect):
self.to_scan.append(item)
else:
if item in added:
continue
self.to_predict.append(item)
added.add(item)
self.to_predict.append(item)

self.item_count += 1 # Only count if actually added



tests/__main__.py (+1, -1)

@@ -5,7 +5,7 @@ import logging

from .test_trees import TestTrees
# from .test_selectors import TestSelectors
from .test_parser import TestLalrStandard, TestEarleyStandard, TestLalrContextual, TestParsers
from .test_parser import TestLalrStandard, TestEarleyStandard, TestLalrContextual, TestParsers, TestEarleyScanless
# from .test_grammars import TestPythonG, TestConfigG

logging.basicConfig(level=logging.INFO)


tests/test_parser.py (+22, -6)

@@ -49,7 +49,14 @@ class TestParsers(unittest.TestCase):


class TestEarley(unittest.TestCase):
pass
def test_anon_in_scanless(self):
# Fails an Earley implementation without special handling for empty rules,
# or re-processing of already completed rules.
g = Lark(r"""start: B
B: ("ab"|/[^b]/)*
""", lexer=None)

assertEqual( g.parse('abc'), 'abc')


def _make_parser_test(LEXER, PARSER):
@@ -98,6 +105,7 @@ def _make_parser_test(LEXER, PARSER):
""")
g.parse(u'\xa3\u0101\u00a3')

@unittest.skipIf(LEXER is None, "Regexps >1 not supported with scanless parsing")
def test_unicode2(self):
g = _Lark(r"""start: UNIA UNIB UNIA UNIC
UNIA: /\xa3/
@@ -106,6 +114,14 @@ def _make_parser_test(LEXER, PARSER):
""")
g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

def test_unicode3(self):
g = _Lark(r"""start: UNIA UNIB UNIA UNIC
UNIA: /\xa3/
UNIB: "\u0101"
UNIC: /\u0203/ /\n/
""")
g.parse(u'\xa3\u0101\u00a3\u0203\n')


def test_recurse_expansion(self):
"""Verify that stack depth doesn't get exceeded on recursive rules marked for expansion."""
@@ -279,7 +295,7 @@ def _make_parser_test(LEXER, PARSER):

def test_token_collision(self):
g = _Lark("""start: "Hello" NAME
NAME: /\w+/
NAME: /\w/+
%ignore " "
""")
x = g.parse('Hello World')
@@ -320,6 +336,7 @@ def _make_parser_test(LEXER, PARSER):
x = g.parse('aaaab')
x = g.parse('b')

@unittest.skipIf(LEXER is None, "Regexps >1 not supported with scanless parsing")
def test_regex_embed(self):
g = _Lark("""start: A B C
A: /a/
@@ -413,9 +430,7 @@ def _make_parser_test(LEXER, PARSER):
# or re-processing of already completed rules.
g = _Lark(r"""start: _empty a "B"
a: _empty "A"
_empty: _empty2
_empty2: _empty3
_empty3:
_empty:
""")
x = g.parse('AB')

@@ -437,7 +452,7 @@ def _make_parser_test(LEXER, PARSER):
g.parse("+2e-9")
self.assertRaises(ParseError, g.parse, "+2e-9e")

_NAME = "Test" + PARSER.capitalize() + (LEXER or 'None').capitalize()
_NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize()
_TestParser.__name__ = _NAME
globals()[_NAME] = _TestParser

@@ -445,6 +460,7 @@ _TO_TEST = [
('standard', 'earley'),
('standard', 'lalr'),
('contextual', 'lalr'),
(None, 'earley'),
]

for LEXER, PARSER in _TO_TEST:

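The new `(None, 'earley')` entry generates a TestEarleyScanless class, i.e. the whole suite re-run with no lexer at all. A usage sketch of that configuration, assuming the 0.5.1-era API these tests exercise (later lark releases renamed the scanless option):

```python
from lark import Lark

# lexer=None plus parser='earley' selects the scanless mode enabled by
# this commit; the grammar mirrors test_token_collision above, where the
# token was rewritten as /\w/+ to satisfy the width-1 requirement.
g = Lark(r'''start: "Hello" NAME
             NAME: /\w/+
             %ignore " "
          ''', parser='earley', lexer=None)
tree = g.parse('Hello World')
```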
