
Off-by-one fix + Change of thresholds + fix tests

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.12.0
MegaIng, 3 years ago
parent
commit
fa8565366b
3 files changed, 37 additions and 19 deletions
  1. lark/load_grammar.py    +22  -7
  2. lark/utils.py            +5  -3
  3. tests/test_parser.py    +10  -9

lark/load_grammar.py    +22  -7

@@ -174,9 +174,21 @@ RULES = {
     'literal': ['REGEXP', 'STRING'],
 }

-REPEAT_BREAK_THRESHOLD = 20
+REPEAT_BREAK_THRESHOLD = 50
 # The threshold at which repeats via ~ are split up into different rules
-# For the moment 20 is arbitrarily chosen
+# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low,
+# while not being too overaggressive and unnecessarily creating rules that might create shift/reduce conflicts.
+# For a grammar of the form start: "A"~0..N, these are the timing stats:
+#   N    t
+#   10   0.000
+#   20   0.004
+#   30   0.016
+#   40   0.049
+#   50   0.109
+#   60   0.215
+#   70   0.383
+#   80   0.631
+# (See PR #949)


 @inline_args
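
The timing column above can be sanity-checked with a short script along the following lines. This is a rough sketch, not the measurement script from the PR: absolute numbers depend on the machine, and with the new threshold of 50 the larger values of N will be split into sub-rules rather than expanded naively.

import time
from lark import Lark

# Build grammars of the form start: "A"~0..N and time how long Lark takes to
# construct the parser; lalr analysis dominates the cost being measured.
for n in (10, 20, 30, 40, 50, 60, 70, 80):
    grammar = 'start: "A"~0..%d' % n
    t0 = time.time()
    Lark(grammar, parser='lalr')
    print(n, round(time.time() - t0, 3))
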
@@ -244,6 +256,7 @@ class EBNF_to_BNF(Transformer_InPlace):
         | target target_opt
         | target target target_opt
+        | target target target
         | target target target atom
         | target target target atom atom
         | target target target atom atom atom
@@ -251,7 +264,7 @@ class EBNF_to_BNF(Transformer_InPlace):
         First we generate target * i followed by target_opt for i from 0 to a-1
         These match 0 to n*a - 1 times atom

-        Then we generate target * a followed by atom * i for i from 1 to b-1
+        Then we generate target * a followed by atom * i for i from 0 to b-1
         These match n*a to n*a + b-1 times atom
         """
         key = (a, b, target, atom, "opt")
@@ -264,7 +277,7 @@ class EBNF_to_BNF(Transformer_InPlace):
             for i in range(a)
         ] + [
             ST('expansion', [target] * a + [atom] * i)
-            for i in range(1, b)
+            for i in range(b)
         ])
         return self._add_rule(key, new_name, tree)

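The effect of the range fix can be seen by enumerating the alternatives the way the docstring describes. This is a small illustrative sketch; the names target, target_opt and atom stand in for the generated rules and are not lark internals.

def repeat_opt_alternatives(a, b):
    # Alternatives 0..a-1: i copies of target followed by target_opt,
    # covering 0 to n*a - 1 repeats of atom.
    alts = [["target"] * i + ["target_opt"] for i in range(a)]
    # Alternatives for the remainder: a copies of target followed by i atoms,
    # covering n*a to n*a + b - 1 repeats; using range(b) instead of range(1, b)
    # restores the previously missing "target target target" case.
    alts += [["target"] * a + ["atom"] * i for i in range(b)]
    return alts

for alt in repeat_opt_alternatives(3, 4):
    print(" ".join(alt))
# Prints the seven alternatives listed in the docstring example above for a=3, b=4.
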
@@ -281,15 +294,17 @@ class EBNF_to_BNF(Transformer_InPlace):
             mn_target = self._add_repeat_rule(a, b, mn_target, rule)
         if mx == mn:
             return mn_target

         diff = mx - mn + 1  # We add one because _add_repeat_opt_rule generates rules that match one less
         diff_factors = small_factors(diff)
-        diff_target = rule
-        diff_opt_target = ST('expansion', [])  # match rule 0 times (e.g. 1-1 times)
+        diff_target = rule  # Match rule 1 times
+        diff_opt_target = ST('expansion', [])  # match rule 0 times (e.g. up to 1-1 times)
         for a, b in diff_factors[:-1]:
             new_diff_target = self._add_repeat_rule(a, b, diff_target, rule)
             diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
             diff_target = new_diff_target
-        a, b = diff_factors[-1]
+
+        a, b = diff_factors[-1]  # We do the last one separately since we don't need to call self._add_repeat_rule
         diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
+
         return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])

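Read together, the comments above say that rule~mn..mx is built as a mandatory part matching the rule exactly mn times, followed by an optional chain matching it up to mx - mn more times. A quick check of the off-by-one reasoning (illustrative only):

mn, mx = 15, 100
diff = mx - mn + 1  # 86; the opt chain matches one less, i.e. 0..85 extra repeats
covered = {mn + extra for extra in range(diff)}
assert covered == set(range(mn, mx + 1))  # exactly the repeat counts 15..100
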

lark/utils.py    +5  -3

@@ -361,8 +361,9 @@ def _serialize(value, memo):
     return value


-# 10 is arbitrarily chosen
-SMALL_FACTOR_THRESHOLD = 10
+# Value 5 keeps the number of states in the lalr parser somewhat minimal
+# It isn't optimal, but close to it. See PR #949
+SMALL_FACTOR_THRESHOLD = 5


 def small_factors(n):
@@ -380,7 +381,8 @@ def small_factors(n):
     assert n >= 0
     if n < SMALL_FACTOR_THRESHOLD:
         return [(n, 0)]
-    # TODO: Think of better algorithms (Prime factors should minimize the number of steps)
+    # While this does not provide an optimal solution, it produces a pretty good one.
+    # See above comment and PR #949
     for a in range(SMALL_FACTOR_THRESHOLD, 1, -1):
         b = n % a
         if a + b > SMALL_FACTOR_THRESHOLD:

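The decomposition small_factors performs can be illustrated with a self-contained sketch reconstructed from the visible lines of the hunk above. The body of the final if is cut off here, so the sketch assumes factors whose a + b exceeds the threshold are simply skipped; treat it as an illustration rather than the exact library code.

SMALL_FACTOR_THRESHOLD = 5

def small_factors_sketch(n):
    # Split n into pairs (a, b), each at most the threshold, so that folding
    # x -> x * a + b over the pairs (starting from 1) reproduces n.
    assert n >= 0
    if n < SMALL_FACTOR_THRESHOLD:
        return [(n, 0)]
    for a in range(SMALL_FACTOR_THRESHOLD, 1, -1):
        b = n % a
        if a + b > SMALL_FACTOR_THRESHOLD:
            continue  # assumption: such factors are skipped
        return small_factors_sketch(n // a) + [(a, b)]
    raise AssertionError("could not factor %d" % n)

pairs = small_factors_sketch(86)  # 86 is the diff for "A"~15..100 (100 - 15 + 1)
n = 1
for a, b in pairs:
    n = n * a + b
assert n == 86
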

tests/test_parser.py    +10  -9

@@ -2229,21 +2229,22 @@ def _make_parser_test(LEXER, PARSER):
     @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated")
     def test_ranged_repeat_large(self):
         # Large is currently arbitrarily chosen to be larger than 20
-        g = u"""!start: "A"~30
+        g = u"""!start: "A"~60
                 """
         l = _Lark(g)
         self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated")
-        self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30))
-        self.assertRaises(ParseError, l.parse, u'A' * 29)
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31)
+        self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60))
+        self.assertRaises(ParseError, l.parse, u'A' * 59)
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61)

-        g = u"""!start: "A"~0..100
+        g = u"""!start: "A"~15..100
                 """
         l = _Lark(g)
-        self.assertEqual(l.parse(u''), Tree('start', []))
-        self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
-        self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100))
-        self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101)
+        for i in range(0, 110):
+            if 15 <= i <= 100:
+                self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i))
+            else:
+                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i)

         # 8191 is a Mersenne prime
         g = u"""start: "A"~8191


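For reference, the ranged-repeat syntax exercised by these tests is available through the public API; a minimal usage sketch, assuming lark is installed (_Lark above is just the test-suite helper around lark.Lark):

from lark import Lark

parser = Lark('!start: "A"~15..100', parser='lalr')
tree = parser.parse("A" * 40)
assert len(tree.children) == 40  # the ! prefix keeps the matched "A" tokens in the tree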