
Final fixes for scanless (hopefully)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 8 years ago · commit 5c115acbd9
6 changed files with 96 additions and 37 deletions
  1. lark/lexer.py (+3, -0)
  2. lark/load_grammar.py (+39, -5)
  3. lark/parse_tree_builder.py (+27, -8)
  4. lark/parser_frontends.py (+2, -18)
  5. lark/parsers/earley.py (+15, -4)
  6. lark/reconstruct.py (+10, -2)

lark/lexer.py (+3, -0)

@@ -40,6 +40,9 @@ class Token(Str):
     def __repr__(self):
         return 'Token(%s, %r)' % (self.type, self.value)
 
+    def __deepcopy__(self, memo):
+        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)
+
 class Regex:
     def __init__(self, pattern, flags=()):
         self.pattern = pattern

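The new __deepcopy__ matters because the scanless compile path (in load_grammar.py below) now deep-copies rule_defs, and those rule trees embed Token instances, which subclass str and do not deep-copy cleanly by default. A minimal sketch of the intended behaviour, not part of the commit, assuming Token remains importable from lark.lexer as at this revision:

from copy import deepcopy
from lark.lexer import Token

tok = Token('RULE', '__ignore')   # Token subclasses str but carries type/position attributes
tok2 = deepcopy(tok)              # goes through the __deepcopy__ added above
assert isinstance(tok2, Token) and tok2.type == 'RULE' and tok2 == '__ignore'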

lark/load_grammar.py (+39, -5)

@@ -2,6 +2,7 @@ import os.path
 from itertools import chain
 import re
 from ast import literal_eval
+from copy import deepcopy
 
 from .lexer import Token, UnexpectedInput

@@ -348,13 +349,15 @@ class Grammar:
 
     def compile(self, lexer=False, start=None):
         if not lexer:
+            rule_defs = deepcopy(self.rule_defs)
+
             # XXX VERY HACKY!! There must be a better way..
             ignore_tokens = [('_'+name, t) for name, t in self.token_defs if name in self.extra['ignore']]
             if ignore_tokens:
                 self.token_defs = [('_'+name if name in self.extra['ignore'] else name,t) for name,t in self.token_defs]
                 ignore_names = [t[0] for t in ignore_tokens]
                 expr = Token('RULE', '__ignore')
-                for r, tree, _o in self.rule_defs:
+                for r, tree, _o in rule_defs:
                     for exp in tree.find_data('expansion'):
                         exp.children = list(interleave(exp.children, expr))
                         if r == start: # TODO use GrammarRule or similar (RuleOptions?)
@@ -362,14 +365,34 @@
 
                 x = [T('expansion', [Token('RULE', x)]) for x in ignore_names]
                 _ignore_tree = T('expr', [T('expansions', x), Token('OP', '?')])
-                self.rule_defs.append(('__ignore', _ignore_tree, None))
+                rule_defs.append(('__ignore', _ignore_tree, None))
             # End of "ignore" section
 
             for name, tree in self.token_defs:
-                self.rule_defs.append((name, tree, RuleOptions(keep_all_tokens=True)))
+                rule_defs.append((name, tree, RuleOptions(keep_all_tokens=True)))
 
+            token_defs = []
+
+            tokens_to_convert = {name: '__token_'+name for name, tree, _ in rule_defs if is_terminal(name)}
+            new_rule_defs = []
+            for name, tree, options in rule_defs:
+                if name in tokens_to_convert:
+                    if name.startswith('_'):
+                        options = RuleOptions.new_from(options, filter_out=True)
+                    else:
+                        options = RuleOptions.new_from(options, join_children=True)
+
+                name = tokens_to_convert.get(name, name)
+                for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
+                    for i, sym in enumerate(exp.children):
+                        if sym in tokens_to_convert:
+                            exp.children[i] = Token(sym.type, tokens_to_convert[sym])
+                new_rule_defs.append((name, tree, options))
+
+            rule_defs = new_rule_defs
+        else:
+            token_defs = list(self.token_defs)
+            rule_defs = self.rule_defs
 
         # =================
         # Compile Tokens
@@ -410,7 +433,7 @@
         rule_tree_to_text = RuleTreeToText()
         rules = {}
 
-        for name, rule_tree, options in self.rule_defs:
+        for name, rule_tree, options in rule_defs:
             assert name not in rules, name
             rule_tree = PrepareLiterals().transform(rule_tree)
             if not lexer:
@@ -431,9 +454,20 @@
 
 
 class RuleOptions:
-    def __init__(self, keep_all_tokens=False, expand1=False):
+    def __init__(self, keep_all_tokens=False, expand1=False, join_children=False, filter_out=False):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
+        self.join_children = join_children  # used for scanless postprocessing
+
+        self.filter_out = filter_out  # remove this rule from the tree
+                                      # used for "token"-rules in scanless
+
+    @classmethod
+    def new_from(cls, options, **kw):
+        return cls(
+            keep_all_tokens=options and options.keep_all_tokens,
+            expand1=options and options.expand1,
+            **kw)
 
 def _extract_options_for_rule(name, expansions):
     keep_all_tokens = name.startswith('!')

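This is the heart of the scanless fix: when compiling without a lexer, every terminal definition is appended to the rule set and renamed '__token_<NAME>', with '_'-prefixed terminals marked filter_out (pruned from the tree) and the rest marked join_children (their matched pieces are re-joined into one string by the tree builder). A small illustrative sketch of the renaming step, using invented symbol names rather than anything from the commit:

# Invented example; mirrors the tokens_to_convert renaming in the hunk above.
tokens_to_convert = {'NUMBER': '__token_NUMBER', '_COMMA': '__token__COMMA'}

expansion = ['NUMBER', '_COMMA', 'value']     # a rule body referencing terminals
renamed = [tokens_to_convert.get(sym, sym) for sym in expansion]
print(renamed)   # ['__token_NUMBER', '__token__COMMA', 'value']

# RuleOptions.new_from(None, join_children=True) also works for rules that had no
# options object: 'options and options.keep_all_tokens' short-circuits to a falsy value.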

lark/parse_tree_builder.py (+27, -8)

@@ -12,11 +12,19 @@ def create_expand1_tree_builder_function(tree_builder):
         return tree_builder(children)
     return expand1
 
-def create_rule_handler(expansion, usermethod, keep_all_tokens):
+def create_join_children(tree_builder):
+    def join_children(children):
+        children = [''.join(children)]
+        return tree_builder(children)
+    return join_children
+
+def create_rule_handler(expansion, usermethod, keep_all_tokens, filter_out):
     # if not keep_all_tokens:
     to_include = [(i, not is_terminal(sym) and sym.startswith('_'))
                   for i, sym in enumerate(expansion)
-                  if keep_all_tokens or not is_terminal(sym) or not sym.startswith('_')]
+                  if keep_all_tokens
+                  or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out)
+                  ]
 
     if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
         def _build_ast(match):
@@ -49,9 +57,17 @@ class ParseTreeBuilder:
     def create_tree_builder(self, rules, transformer):
         callback = Callback()
         new_rules = []
 
+        filter_out = set()
+        for origin, (expansions, options) in rules.items():
+            if options and options.filter_out:
+                assert origin.startswith('_') # Just to make sure
+                filter_out.add(origin)
+
         for origin, (expansions, options) in rules.items():
             keep_all_tokens = options.keep_all_tokens if options else False
             expand1 = options.expand1 if options else False
+            join_children = options.join_children if options else False
 
             _origin = origin

@@ -59,8 +75,6 @@
                 if alias and origin.startswith('_'):
                     raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
 
-                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))
-
                 try:
                     f = transformer._get_func(alias or _origin)
                 except AttributeError:
@@ -71,12 +85,17 @@
                 if expand1:
                     f = create_expand1_tree_builder_function(f)
 
-                alias_handler = create_rule_handler(expansion, f, keep_all_tokens)
+                if join_children:
+                    f = create_join_children(f)
+
+
+                alias_handler = create_rule_handler(expansion, f, keep_all_tokens, filter_out)
 
-                if hasattr(callback, _alias):
+                callback_name = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))
+                if hasattr(callback, callback_name):
                     raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
-                setattr(callback, _alias, alias_handler)
+                setattr(callback, callback_name, alias_handler)
 
-                new_rules.append(( _origin, expansion, _alias ))
+                new_rules.append(( _origin, expansion, callback_name ))
 
         return new_rules, callback

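create_join_children is what makes the converted '__token_*' rules behave like tokens again: in scanless mode each child is a single matched character, and the wrapper joins them into one string before handing them to the tree-builder callback. A self-contained sketch of just that wrapper, with list standing in for the real callback:

def create_join_children(tree_builder):
    def join_children(children):
        children = [''.join(children)]   # e.g. ['1', '2', '3'] -> ['123']
        return tree_builder(children)
    return join_children

build = create_join_children(list)       # 'list' stands in for the real tree-builder callback
print(build(['1', '2', '3']))            # ['123']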
lark/parser_frontends.py (+2, -18)

@@ -149,16 +149,9 @@ class Nearley_NoLex:
 
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
-        self.tokens_to_convert = {name: '__token_'+name for name, tree, _ in parser_conf.rules if is_terminal(name)}
-        rules = []
-        for name, exp, alias in parser_conf.rules:
-            name = self.tokens_to_convert.get(name, name)
-            exp = [self.tokens_to_convert.get(x, x) for x in exp]
-            rules.append((name, exp, alias))
-
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
-        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in rules]
+        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
 
         self.parser = earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))

@@ -177,16 +170,7 @@ class Earley_NoLex:
         new_text = tokenize_text(text)
         res = self.parser.parse(new_text)
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
-        res = res[0]
-
-        class RestoreTokens(Transformer):
-            pass
-
-        for t in self.tokens_to_convert:
-            setattr(RestoreTokens, t, ''.join)
-
-        res = RestoreTokens().transform(res)
-        return res
+        return res[0]
 
 
 def get_frontend(parser, lexer):

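With the token-to-rule conversion and the RestoreTokens post-pass moved into Grammar.compile() and the tree builder, Earley_NoLex only has to feed the raw text to the parser one element per character. A simplified, hypothetical stand-in for that idea (lark's real tokenize_text helper also tracks line and column):

def tokenize_text(text):
    # simplified: the text itself becomes the 'token' stream, one character at a time,
    # tagged with its offset so errors can still point somewhere
    return [(ch, i) for i, ch in enumerate(text)]

print(tokenize_text('a+b'))   # [('a', 0), ('+', 1), ('b', 2)]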

lark/parsers/earley.py (+15, -4)

@@ -40,7 +40,7 @@ class Item(object):
     def __repr__(self):
         before = map(str, self.rule.expansion[:self.ptr])
         after = map(str, self.rule.expansion[self.ptr:])
-        return '<(%d) %s : %s * %s>' % (self.start, self.rule.origin, ' '.join(before), ' '.join(after))
+        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
 
 
 class NewsList(list):
@@ -76,9 +76,20 @@ class Column:
         for item in items:
 
             if item.is_complete:
-                # if item in added: # XXX This causes a bug with empty rules
-                # continue # And might be unnecessary
-                # added.add(item)
+
+                # (We must allow repetition of empty rules)
+                if item.rule.expansion:
+
+                    # This is an important test to avoid infinite-loops,
+                    # For example for the rule:
+                    #   a: a | "b"
+                    # If we can detect these cases statically, we can remove
+                    # this test an gain a small optimization
+                    #
+                    if item in added:
+                        continue
+                    added.add(item)
+
                 self.to_reduce.append(item)
             else:
                 if is_terminal(item.expect):

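The new guard deduplicates completed items of non-empty rules, which is what keeps a self-referencing rule such as a: a | "b" from re-completing itself in the same column forever, while empty-rule completions are still allowed to repeat. A toy sketch of that control flow, with invented item tuples instead of real Earley items:

added = set()
to_reduce = []

def complete(item, rule_is_empty):
    if not rule_is_empty:            # empty rules must be allowed to repeat
        if item in added:            # already reduced in this column: stop here
            return
        added.add(item)
    to_reduce.append(item)

complete(('a -> a *', 0), rule_is_empty=False)
complete(('a -> a *', 0), rule_is_empty=False)   # dropped, breaking the would-be loop
print(to_reduce)                                 # [('a -> a *', 0)]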

lark/reconstruct.py (+10, -2)

@@ -23,12 +23,17 @@ class Reconstructor:
         tokens = {t.name:t for t in parser.lexer_conf.tokens}
         token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in parser.lexer_conf.tokens}
 
-        class MatchData:
+        class MatchData(object):
             def __init__(self, data):
                 self.data = data
 
+            def __repr__(self):
+                return '%s(%r)' % (type(self).__name__, self.data)
+
         class MatchTerminal(MatchData):
             def __call__(self, other):
+                if isinstance(other, Tree):
+                    return False
                 return token_res[self.data].match(other) is not None
 
         class MatchTree(MatchData):
@@ -66,8 +71,11 @@ class Reconstructor:
 
             return to_write
 
+        # Recreate the rules to assume a standard lexer
+        _tokens, rules, _grammar_extra = parser.grammar.compile(lexer='standard', start='whatever')
+
         d = defaultdict(list)
-        for name, (expansions, _o) in parser.rules.items():
+        for name, (expansions, _o) in rules.items():
             for expansion, alias in expansions:
                 if alias:
                     d[alias].append(expansion)

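MatchTerminal now rejects subtrees outright and only regex-matches plain strings, and the Reconstructor works from the grammar recompiled with lexer='standard' rather than the scanless rules the parser itself used. A standalone sketch of the matching check, with a tuple standing in for lark's Tree and an invented terminal:

import re

token_res = {'NAME': re.compile(r'[a-z]+')}      # terminal name -> compiled pattern

class MatchData(object):
    def __init__(self, data):
        self.data = data
    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.data)

class MatchTerminal(MatchData):
    def __call__(self, other):
        if isinstance(other, tuple):             # stands in for: isinstance(other, Tree)
            return False
        return token_res[self.data].match(other) is not None

m = MatchTerminal('NAME')
print(m, m('abc'), m(('tree', [])))              # MatchTerminal('NAME') True False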
