From 7d11dfa5cd22b3e7bcc8fa75cb5044bf3669316a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 10 Feb 2018 19:43:19 +0200 Subject: [PATCH] FEATURE: Added support for ranged-repeat for rules and terminals (Issues #75, #19) Syntax: symbol~number | symbol~min..max Example: HEXCOLOR: "#" (HEXDIGIT~3 | HEXDIGIT~6) short_sentence: word~4..20 Added range for tokens --- lark/load_grammar.py | 28 ++++++++++++++++++++--- tests/test_parser.py | 54 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 85738de..1637514 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -70,6 +70,7 @@ TOKENS = { '_COLON': ':', '_OR': r'\|', '_DOT': r'\.', + 'TILDE': '~', 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TOKEN': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', @@ -100,7 +101,10 @@ RULES = { '_expansion': ['', '_expansion expr'], '?expr': ['atom', - 'atom OP'], + 'atom OP', + 'atom TILDE NUMBER', + 'atom TILDE NUMBER _DOT _DOT NUMBER', + ], '?atom': ['_LPAR expansions _RPAR', 'maybe', @@ -146,7 +150,7 @@ class EBNF_to_BNF(InlineTransformer): self.rules_by_expr[expr] = t return t - def expr(self, rule, op): + def expr(self, rule, op, *args): if op.value == '?': return T('expansions', [rule, T('expansion', [])]) elif op.value == '+': @@ -162,6 +166,14 @@ class EBNF_to_BNF(InlineTransformer): # _c : _c c | c; new_name = self._add_recurse_rule('star', rule) return T('expansions', [new_name, T('expansion', [])]) + elif op.value == '~': + if len(args) == 1: + mn = mx = int(args[0]) + else: + mn, mx = map(int, args) + if mx < mn: + raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) + return T('expansions', [T('expansion', [rule] * n) for n in range(mn, mx+1)]) assert False, op @@ -377,7 +389,17 @@ class TokenTreeToPattern(Transformer): return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)), exps[0].flags) def expr(self, args): - inner, op = args + inner, op = args[:2] + if op == '~': + if len(args) == 3: + op = "{%d}" % int(args[2]) + else: + mn, mx = map(int, args[2:]) + if mx < mn: + raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (inner, mn, mx)) + op = "{%d,%d}" % (mn, mx) + else: + assert len(args) == 2 return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags) diff --git a/tests/test_parser.py b/tests/test_parser.py index ed716bb..89b9d69 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1074,6 +1074,60 @@ def _make_parser_test(LEXER, PARSER): _Lark(r'start: "\\\t"').parse('\\\t') + def test_ranged_repeat_rules(self): + g = u"""!start: "A"~3 + """ + l = _Lark(g) + self.assertEqual(l.parse(u'AAA'), Tree('start', ["A", "A", "A"])) + self.assertRaises(ParseError, l.parse, u'AA') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') + + + g = u"""!start: "A"~0..2 + """ + if PARSER != 'cyk': # XXX CYK currently doesn't support empty grammars + l = _Lark(g) + self.assertEqual(l.parse(u''), Tree('start', [])) + self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) + self.assertEqual(l.parse(u'AA'), Tree('start', ['A', 'A'])) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'AAA') + + g = u"""!start: "A"~3..2 + """ + self.assertRaises(GrammarError, _Lark, g) + + g = u"""!start: "A"~2..3 "B"~2 + """ + l = _Lark(g) + self.assertEqual(l.parse(u'AABB'), Tree('start', ['A', 'A', 'B', 'B'])) + self.assertEqual(l.parse(u'AAABB'), Tree('start', ['A', 'A', 'A', 'B', 'B'])) + self.assertRaises(ParseError, l.parse, u'AAAB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + + + def test_ranged_repeat_terms(self): + g = u"""!start: AAA + AAA: "A"~3 + """ + l = _Lark(g) + self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') + + g = u"""!start: AABB CC + AABB: "A"~0..2 "B"~2 + CC: "C"~1..2 + """ + l = _Lark(g) + self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) + self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) + self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')