Browse Source

Fixed token visibility rules (Issue #109)

Anonymous tokens would become visible if they happened to have the same value as a named token, because the two are merged for the lexer. After this change, a token's visibility is determined by how it is used in the rule, rather than by its name or identity.
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.6.0
Erez Shinan 6 years ago
parent
commit
7b32ffd83a
5 changed files with 27 additions and 37 deletions
  1. +6
    -8
      lark/grammar.py
  2. +7
    -4
      lark/load_grammar.py
  3. +4
    -7
      lark/parse_tree_builder.py
  4. +2
    -7
      lark/parser_frontends.py
  5. +8
    -11
      tests/test_parser.py

+ 6
- 8
lark/grammar.py View File

@@ -20,9 +20,10 @@ class Symbol(object):
class Terminal(Symbol):
is_term = True

@property
def filter_out(self):
return self.name.startswith('_')
def __init__(self, name, filter_out=False):
self.name = name
self.filter_out = filter_out


class NonTerminal(Symbol):
is_term = False
@@ -46,17 +47,14 @@ class Rule(object):


class RuleOptions:
def __init__(self, keep_all_tokens=False, expand1=False, filter_out=False, priority=None):
def __init__(self, keep_all_tokens=False, expand1=False, priority=None):
self.keep_all_tokens = keep_all_tokens
self.expand1 = expand1
self.priority = priority

self.filter_out = filter_out # remove this rule from the tree

def __repr__(self):
return 'RuleOptions(%r, %r, %r, %r)' % (
return 'RuleOptions(%r, %r, %r)' % (
self.keep_all_tokens,
self.expand1,
self.priority,
self.filter_out
)

+ 7
- 4
lark/load_grammar.py View File

@@ -307,7 +307,7 @@ class ExtractAnonTokens(InlineTransformer):
self.token_reverse[p] = tokendef
self.tokens.append(tokendef)

return Terminal(Token('TOKEN', token_name, -1))
return Terminal(Token('TOKEN', token_name, -1), filter_out=isinstance(p, PatternStr))


def _rfind(s, choices):
@@ -415,8 +415,11 @@ class PrepareSymbols(Transformer):
v ,= v
if isinstance(v, Tree):
return v
return {'TOKEN': Terminal,
'RULE': NonTerminal}[v.type](v.value)
elif v.type == 'RULE':
return NonTerminal(v.value)
elif v.type == 'TOKEN':
return Terminal(v.value, filter_out=v.startswith('_'))
assert False

def _choice_of_rules(rules):
return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
@@ -532,7 +535,7 @@ def options_from_rule(name, *x):


def symbols_from_strcase(expansion):
return [Terminal(x) if is_terminal(x) else NonTerminal(x) for x in expansion]
return [Terminal(x, filter_out=x.startswith('_')) if is_terminal(x) else NonTerminal(x) for x in expansion]

class PrepareGrammar(InlineTransformer):
def terminal(self, name):


+ 4
- 7
lark/parse_tree_builder.py View File

@@ -75,8 +75,9 @@ class ChildFilterLALR(ChildFilter):
def _should_expand(sym):
return not sym.is_term and sym.name.startswith('_')

def maybe_create_child_filter(expansion, filter_out, ambiguous):
to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out]
def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous):
to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion)
if keep_all_tokens or not (sym.is_term and sym.filter_out)]

if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
@@ -97,10 +98,6 @@ class ParseTreeBuilder:
self.user_aliases = {}

def _init_builders(self, rules):
filter_out = {rule.origin for rule in rules if rule.options and rule.options.filter_out}
filter_out |= {sym for rule in rules for sym in rule.expansion if sym.is_term and sym.filter_out}
assert all(t.name.startswith('_') for t in filter_out)

for rule in rules:
options = rule.options
keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
@@ -108,7 +105,7 @@ class ParseTreeBuilder:

wrapper_chain = filter(None, [
(expand_single_child and not rule.alias) and ExpandSingleChild,
maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous),
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous),
self.propagate_positions and PropagatePositions,
])



+ 2
- 7
lark/parser_frontends.py View File

@@ -7,11 +7,6 @@ from .lexer import Lexer, ContextualLexer, Token
from .common import GrammarError
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .tree import Tree
from .grammar import Terminal, NonTerminal

def terminals(seq):
# return [Terminal(t) for t in seq]
return seq

class WithLexer:
def init_traditional_lexer(self, lexer_conf):
@@ -23,8 +18,8 @@ class WithLexer:
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
self.lexer = ContextualLexer(lexer_conf.tokens, states,
ignore=terminals(lexer_conf.ignore),
always_accept=terminals(always_accept),
ignore=lexer_conf.ignore,
always_accept=always_accept,
user_callbacks=lexer_conf.callbacks)

def lex(self, text):


+ 8
- 11
tests/test_parser.py View File

@@ -649,28 +649,25 @@ def _make_parser_test(LEXER, PARSER):
x = g.parse('b')

def test_token_not_anon(self):
"""Tests that "a" is matched as A, rather than an anonymous token.

That means that "a" is not filtered out, despite being an 'immediate string'.
Whether or not this is the intuitive behavior, I'm not sure yet.

Perhaps the right thing to do is report a collision (if such is relevant)

-Erez
"""Tests that "a" is matched as an anonymous token, and not A.
"""

g = _Lark("""start: "a"
A: "a" """)
x = g.parse('a')
self.assertEqual(len(x.children), 0, '"a" should be considered anonymous')

self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
g = _Lark("""start: "a" A
A: "a" """)
x = g.parse('aa')
self.assertEqual(len(x.children), 1, 'only "a" should be considered anonymous')
self.assertEqual(x.children[0].type, "A")

g = _Lark("""start: /a/
A: /a/ """)
x = g.parse('a')
self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous')
self.assertEqual(x.children[0].type, "A")
self.assertEqual(len(x.children), 1)
self.assertEqual(x.children[0].type, "A", "A isn't associated with /a/")

@unittest.skipIf(PARSER == 'cyk', "No empty rules")
def test_maybe(self):


Loading…
Cancel
Save