
Fixed token visibility rules (Issue #109)

Anonymous tokens would become visible if they had the same value as a named token,
because the two are merged for the lexer. After this change, a token's visibility is
determined by how it is used in the rule, not by its name or identity.
Erez Shinan · 6 years ago · commit 7b32ffd83a
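
A minimal sketch of the new behavior, distilled from the updated test below (this
assumes the 0.6.0-era default Lark API; `Lark` here is the public entry point, not
the test helper):

    from lark import Lark

    # "a" in `start` is an anonymous token; A is a named token with the same
    # value. The two are still merged for the lexer, but the anonymous string
    # is now marked filter_out, so it disappears from the tree:
    g = Lark('''start: "a"
                A: "a" ''')
    assert len(g.parse('a').children) == 0

    # An anonymous regexp used in a rule is kept, and resolves to A:
    g = Lark('''start: /a/
                A: /a/ ''')
    tree = g.parse('a')
    assert len(tree.children) == 1 and tree.children[0].type == 'A'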
5 changed files with 27 additions and 37 deletions:
  1. lark/grammar.py (+6 -8)
  2. lark/load_grammar.py (+7 -4)
  3. lark/parse_tree_builder.py (+4 -7)
  4. lark/parser_frontends.py (+2 -7)
  5. tests/test_parser.py (+8 -11)

lark/grammar.py (+6 -8)

@@ -20,9 +20,10 @@ class Symbol(object):
 class Terminal(Symbol):
     is_term = True
 
-    @property
-    def filter_out(self):
-        return self.name.startswith('_')
+    def __init__(self, name, filter_out=False):
+        self.name = name
+        self.filter_out = filter_out
 
 
 class NonTerminal(Symbol):
     is_term = False
@@ -46,17 +47,14 @@ class Rule(object):
 
 
 class RuleOptions:
-    def __init__(self, keep_all_tokens=False, expand1=False, filter_out=False, priority=None):
+    def __init__(self, keep_all_tokens=False, expand1=False, priority=None):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
         self.priority = priority
 
-        self.filter_out = filter_out  # remove this rule from the tree
-
     def __repr__(self):
-        return 'RuleOptions(%r, %r, %r, %r)' % (
+        return 'RuleOptions(%r, %r, %r)' % (
             self.keep_all_tokens,
             self.expand1,
             self.priority,
-            self.filter_out
         )
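
The net effect in grammar.py: `filter_out` is explicit per-instance state chosen by
whoever builds the symbol, rather than a property derived from the name. A tiny
illustrative sketch (assumes the post-commit `Terminal` above; not code from this
commit):

    from lark.grammar import Terminal

    t1 = Terminal('A', filter_out=True)   # filtered despite a "visible" name
    t2 = Terminal('_NL')                  # kept unless the caller opts in
    assert t1.filter_out and not t2.filter_out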

lark/load_grammar.py (+7 -4)

@@ -307,7 +307,7 @@ class ExtractAnonTokens(InlineTransformer):
         self.token_reverse[p] = tokendef
         self.tokens.append(tokendef)
 
-        return Terminal(Token('TOKEN', token_name, -1))
+        return Terminal(Token('TOKEN', token_name, -1), filter_out=isinstance(p, PatternStr))
 
 
 def _rfind(s, choices):
@@ -415,8 +415,11 @@ class PrepareSymbols(Transformer):
         v ,= v
         if isinstance(v, Tree):
             return v
-        return {'TOKEN': Terminal,
-                'RULE': NonTerminal}[v.type](v.value)
+        elif v.type == 'RULE':
+            return NonTerminal(v.value)
+        elif v.type == 'TOKEN':
+            return Terminal(v.value, filter_out=v.startswith('_'))
+        assert False
 
 
 def _choice_of_rules(rules):
     return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
@@ -532,7 +535,7 @@ def options_from_rule(name, *x):
 
 
 def symbols_from_strcase(expansion):
-    return [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion]
+    return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion]
 
 
 class PrepareGrammar(InlineTransformer):
     def terminal(self, name):
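
The policy itself is set here: ExtractAnonTokens marks anonymous tokens built from
string literals (and only those) as filter_out, while PrepareSymbols and
symbols_from_strcase keep the old underscore convention for named symbols. A
condensed, self-contained sketch of the anonymous-token rule (the Pattern classes
below are stand-ins, not lark imports):

    class PatternStr:          # stand-in for a literal such as "a"
        def __init__(self, value): self.value = value

    class PatternRE:           # stand-in for a regexp such as /a/
        def __init__(self, value): self.value = value

    def anon_filter_out(p):
        # Mirrors filter_out=isinstance(p, PatternStr): anonymous literals
        # are punctuation and get dropped; anonymous regexps carry content.
        return isinstance(p, PatternStr)

    assert anon_filter_out(PatternStr('a'))
    assert not anon_filter_out(PatternRE('a'))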


lark/parse_tree_builder.py (+4 -7)

@@ -75,8 +75,9 @@ class ChildFilterLALR(ChildFilter):
 def _should_expand(sym):
     return not sym.is_term and sym.name.startswith('_')
 
-def maybe_create_child_filter(expansion, filter_out, ambiguous):
-    to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out]
+def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous):
+    to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion)
+                  if keep_all_tokens or not (sym.is_term and sym.filter_out)]
 
     if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
         return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
@@ -97,10 +98,6 @@ class ParseTreeBuilder:
         self.user_aliases = {}
 
     def _init_builders(self, rules):
-        filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out}
-        filter_out |= {sym for rule in rules for sym in rule.expansion if sym.is_term and sym.filter_out}
-        assert all(t.name.startswith('_') for t in filter_out)
-
         for rule in rules:
             options = rule.options
             keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
@@ -108,7 +105,7 @@ class ParseTreeBuilder:
             wrapper_chain = filter(None, [
                 (expand_single_child and not rule.alias) and ExpandSingleChild,
-                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous),
+                maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous),
                 self.propagate_positions and PropagatePositions,
             ])
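
With the precomputed filter_out set gone, maybe_create_child_filter decides per
child: keep it if keep_all_tokens is on, or if it isn't a terminal flagged
filter_out. A standalone sketch of that index computation (Sym is a stub standing
in for lark's symbol classes):

    from collections import namedtuple

    Sym = namedtuple('Sym', 'name is_term filter_out')

    def to_include(expansion, keep_all_tokens):
        # (index, should_expand) for every child that survives filtering;
        # should_expand mirrors _should_expand: inline '_'-prefixed rules.
        return [(i, not s.is_term and s.name.startswith('_'))
                for i, s in enumerate(expansion)
                if keep_all_tokens or not (s.is_term and s.filter_out)]

    expansion = [Sym('LPAR', True, True),    # anonymous "(" -> dropped
                 Sym('expr', False, False),  # rule child    -> kept
                 Sym('RPAR', True, True)]    # anonymous ")" -> dropped
    assert to_include(expansion, keep_all_tokens=False) == [(1, False)]
    assert len(to_include(expansion, keep_all_tokens=True)) == 3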




lark/parser_frontends.py (+2 -7)

@@ -7,11 +7,6 @@ from .lexer import Lexer, ContextualLexer, Token
 from .common import GrammarError
 from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
 from .tree import Tree
-from .grammar import Terminal, NonTerminal
-
-def terminals(seq):
-    # return [Terminal(t) for t in seq]
-    return seq
 
 
 class WithLexer:
     def init_traditional_lexer(self, lexer_conf):
@@ -23,8 +18,8 @@ class WithLexer:
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
         self.lexer = ContextualLexer(lexer_conf.tokens, states,
-                                     ignore=terminals(lexer_conf.ignore),
-                                     always_accept=terminals(always_accept),
+                                     ignore=lexer_conf.ignore,
+                                     always_accept=always_accept,
                                      user_callbacks=lexer_conf.callbacks)
 
     def lex(self, text):


tests/test_parser.py (+8 -11)

@@ -649,28 +649,25 @@ def _make_parser_test(LEXER, PARSER):
         x = g.parse('b')
 
     def test_token_not_anon(self):
-        """Tests that "a" is matched as A, rather than an anonymous token.
-
-        That means that "a" is not filtered out, despite being an 'immediate string'.
-        Whether or not this is the intuitive behavior, I'm not sure yet.
-
-        Perhaps the right thing to do is report a collision (if such is relevant)
-
-        -Erez
+        """Tests that "a" is matched as an anonymous token, and not A.
         """
 
         g = _Lark("""start: "a"
                     A: "a" """)
         x = g.parse('a')
-        self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
+        self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')
+
+        g = _Lark("""start: "a" A
+                    A: "a" """)
+        x = g.parse('aa')
+        self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
         self.assertEqual(x.children[0].type, "A")
 
         g = _Lark("""start: /a/
                     A: /a/ """)
         x = g.parse('a')
-        self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous')
-        self.assertEqual(x.children[0].type, "A")
+        self.assertEqual(len(x.children), 1)
+        self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")
 
     @unittest.skipIf(PARSER == 'cyk', "No empty rules")
     def test_maybe(self):

