Browse files

Refactored PR #949 and edited the comments/docstrings

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.12.0
Erez Sh 3 years ago
parent
commit
90460f31d9
3 changed files with 60 additions and 69 deletions
  1. lark/load_grammar.py   +47 -53
  2. lark/utils.py          +9 -12
  3. tests/test_parser.py   +4 -4

lark/load_grammar.py   +47 -53

@@ -174,21 +174,15 @@ RULES = {
     'literal': ['REGEXP', 'STRING'],
 }
 
-REPEAT_BREAK_THRESHOLD = 50
+# Value 5 keeps the number of states in the lalr parser somewhat minimal
+# It isn't optimal, but close to it. See PR #949
+SMALL_FACTOR_THRESHOLD = 5
 # The Threshold whether repeat via ~ are split up into different rules
 # 50 is chosen since it keeps the number of states low and therefore lalr analysis time low,
 # while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts.
-# For a grammar of the form start: "A"~0..N, these are the timing stats:
-# N t
-# 10 0.000
-# 20 0.004
-# 30 0.016
-# 40 0.049
-# 50 0.109
-# 60 0.215
-# 70 0.383
-# 80 0.631
 # (See PR #949)
+REPEAT_BREAK_THRESHOLD = 50
 
 
 @inline_args
@@ -224,17 +218,16 @@ class EBNF_to_BNF(Transformer_InPlace):
         return self._add_rule(expr, new_name, tree)
 
     def _add_repeat_rule(self, a, b, target, atom):
-        """
-        When target matches n times atom
-        This builds a rule that matches atom (a*n + b) times
+        """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times.
 
-        The rule is of the form:
+        When called recursively (into target), it repeats atom for x(n) times, where:
+            x(0) = 1
+            x(n) = a(n) * x(n-1) + b
 
-        The rules are of the form: (Example a = 3, b = 4)
+        Example rule when a=3, b=4:
 
-        new_rule: target target target atom atom atom atom
+            new_rule: target target target atom atom atom atom
 
-        e.g. we use target * a and atom * b
         """
         key = (a, b, target, atom)
         try:
@@ -245,27 +238,29 @@ class EBNF_to_BNF(Transformer_InPlace):
         return self._add_rule(key, new_name, tree)
 
     def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
-        """
+        """Creates a rule that matches atom 0 to (a*n+b)-1 times.
+
         When target matches n times atom, and target_opt 0 to n-1 times target_opt,
-        This builds a rule that matches atom 0 to (a*n+b)-1 times.
-        The created rule will not have any shift/reduce conflicts so that it can be used with lalr
-
-        The rules are of the form: (Example a = 3, b = 4)
-        First we generate target * i followed by target_opt, for i from 0 to a-1
-        These match 0 to n*a - 1 times atom
-
-        Then we generate target * a followed by atom * i, for i from 0 to b-1
-        These match n*a to n*a + b-1 times atom
-
-        new_rule: target_opt
-              | target target_opt
-              | target target target_opt
-
-              | target target target
-              | target target target atom
-              | target target target atom atom
-              | target target target atom atom atom
+
+        The created rule will not have any shift/reduce conflicts so that it can be used with lalr
+
+        Example rule when a=3, b=4:
+
+            First we generate target * i followed by target_opt for i from 0 to a-1
+            These match 0 to n*a - 1 times atom
+            new_rule: target_opt
+                    | target target_opt
+                    | target target target_opt
+
+                    | target target target
+                    | target target target atom
+                    | target target target atom atom
+                    | target target target atom atom atom
+
+            Then we generate target * a followed by atom * i for i from 0 to b-1
+            These match n*a to n*a + b-1 times atom
         """
         key = (a, b, target, atom, "opt")
         try:
@@ -273,38 +268,39 @@ class EBNF_to_BNF(Transformer_InPlace):
         except KeyError:
             new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
             tree = ST('expansions', [
-                ST('expansion', [target] * i + [target_opt])
-                for i in range(a)
+                ST('expansion', [target]*i + [target_opt]) for i in range(a)
             ] + [
-                ST('expansion', [target] * a + [atom] * i)
-                for i in range(b)
+                ST('expansion', [target]*a + [atom]*i) for i in range(b)
             ])
             return self._add_rule(key, new_name, tree)
 
     def _generate_repeats(self, rule, mn, mx):
-        """
-        We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn).
-        We then use small_factors to split up mn and diff up into values [(a, b), ...]
-        This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt
-        to generate a complete rule/expression that matches the corresponding number of repeats
+        """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times.
         """
-        mn_factors = small_factors(mn)
+        # For a small number of repeats, we can take the naive approach
+        if mx < REPEAT_BREAK_THRESHOLD:
+            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
+
+        # For large repeat values, we break the repetition into sub-rules.
+        # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``.
+        # We then use small_factors to split up mn and diff up into values [(a, b), ...]
+        # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt
+        # to generate a complete rule/expression that matches the corresponding number of repeats
         mn_target = rule
-        for a, b in mn_factors:
+        for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD):
             mn_target = self._add_repeat_rule(a, b, mn_target, rule)
         if mx == mn:
             return mn_target
 
         diff = mx - mn + 1  # We add one because _add_repeat_opt_rule generates rules that match one less
-        diff_factors = small_factors(diff)
+        diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD)
         diff_target = rule  # Match rule 1 times
         diff_opt_target = ST('expansion', [])  # match rule 0 times (e.g. up to 1 -1 times)
         for a, b in diff_factors[:-1]:
-            new_diff_target = self._add_repeat_rule(a, b, diff_target, rule)
             diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
-            diff_target = new_diff_target
+            diff_target = self._add_repeat_rule(a, b, diff_target, rule)
 
-        a, b = diff_factors[-1]  # We do the last on separately since we don't need to call self._add_repeat_rule
+        a, b = diff_factors[-1]
         diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
 
         return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])
@@ -333,11 +329,9 @@ class EBNF_to_BNF(Transformer_InPlace):
             mn, mx = map(int, args)
             if mx < mn or mn < 0:
                 raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
-            # For small number of repeats, we don't need to build new rules.
-            if mx > REPEAT_BREAK_THRESHOLD:
-                return self._generate_repeats(rule, mn, mx)
-            else:
-                return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
+
+            return self._generate_repeats(rule, mn, mx)
 
         assert False, op
 
     def maybe(self, rule):
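
Taken together, the load_grammar.py changes keep large ~ repeats usable with the LALR parser. A short usage sketch, not part of this commit, mirroring the "A"~8191 case exercised in tests/test_parser.py further down (assumes lark 0.12.0):

    # usage sketch only; mirrors the test below, assumes lark >= 0.12.0
    from lark import Lark, UnexpectedInput

    # A grammar with a very large ranged repeat; the new rule generation keeps
    # the LALR analysis tractable (see the comments around REPEAT_BREAK_THRESHOLD).
    parser = Lark('start: "A"~8191\n', parser='lalr')

    tree = parser.parse('A' * 8191)      # exactly 8191 repeats parse fine
    assert tree.data == 'start'

    try:
        parser.parse('A' * 8190)         # one repeat short is rejected
        raise AssertionError("expected UnexpectedInput")
    except UnexpectedInput:
        pass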


lark/utils.py   +9 -12

@@ -361,14 +361,11 @@ def _serialize(value, memo):
     return value
 
 
-# Value 5 keeps the number of states in the lalr parser somewhat minimal
-# It isn't optimal, but close to it. See PR #949
-SMALL_FACTOR_THRESHOLD = 5
-
-
-def small_factors(n):
+def small_factors(n, max_factor):
     """
-    Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD.
+    Splits n up into smaller factors and summands <= max_factor.
     Returns a list of [(a, b), ...]
     so that the following code returns n:
 
@@ -376,15 +373,15 @@ def small_factors(n):
     for a, b in values:
         n = n * a + b
 
-    Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change
+    Currently, we also keep a + b <= max_factor, but that might change
    """
     assert n >= 0
-    if n <= SMALL_FACTOR_THRESHOLD:
+    assert max_factor > 2
+    if n <= max_factor:
         return [(n, 0)]
-    # While this does not provide an optimal solution, it produces a pretty good one.
-    # See above comment and PR #949
-    for a in range(SMALL_FACTOR_THRESHOLD, 1, -1):
+
+    for a in range(max_factor, 1, -1):
         r, b = divmod(n, a)
-        if a + b <= SMALL_FACTOR_THRESHOLD:
-            return small_factors(r) + [(a, b)]
+        if a + b <= max_factor:
+            return small_factors(r, max_factor) + [(a, b)]
     assert False, "Failed to factorize %s" % n

tests/test_parser.py   +4 -4

@@ -2226,7 +2226,7 @@ def _make_parser_test(LEXER, PARSER):
             self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
             self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
 
-        @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated")
+        @unittest.skipIf(PARSER != 'lalr', "We only need to test rule generation, we know BNF is solid on all parsers")
         def test_ranged_repeat_large(self):
             # Large is currently arbitrarily chosen to be large than 20
             g = u"""!start: "A"~60
@@ -2244,15 +2244,15 @@ def _make_parser_test(LEXER, PARSER):
                 if 15 <= i <= 100:
                     self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i))
                 else:
-                    self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i)
+                    self.assertRaises(UnexpectedInput, l.parse, u'A' * i)
 
             # 8191 is a Mersenne prime
             g = u"""start: "A"~8191
             """
             l = _Lark(g)
             self.assertEqual(l.parse(u'A' * 8191), Tree('start', []))
-            self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190)
-            self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192)
+            self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190)
+            self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192)
 
 
         @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now")  # TODO XXX

