From e0889a3cf3cd4de596647eee6859eb6667cfc422 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Sun, 29 Aug 2021 23:06:01 +0200 Subject: [PATCH] Sort Options inside a TerminalTree --- lark/load_grammar.py | 4 ++++ tests/test_grammar.py | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 4a9360c..abcfce1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -614,6 +614,10 @@ class TerminalTreeToPattern(Transformer_NonRecursive): if len(exps) == 1: return exps[0] + # Do a bit of sorting to make sure that the longest option is returned + # (Python's re module otherwise prefers just 'l' when given (l|ll) and both could match) + exps.sort(key=lambda x: (-x.max_width, -x.min_width, -len(x.value))) + pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps)) return _make_joined_pattern(pattern, {i.flags for i in exps}) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 319d709..c771f2b 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -247,10 +247,8 @@ class TestGrammar(TestCase): self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) def test_large_terminal(self): - # TODO: The `reversed` below is required because otherwise the regex engine is happy - # with just parsing 9 from the string 999 instead of consuming the longest g = "start: NUMBERS\n" - g += "NUMBERS: " + '|'.join('"%s"' % i for i in reversed(range(0, 1000))) + g += "NUMBERS: " + '|'.join('"%s"' % i for i in range(0, 1000)) l = Lark(g, parser='lalr') for i in (0, 9, 99, 999):