From 794a1c496519304598ac4d7daf250d93e41ad70c Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Thu, 9 Mar 2017 19:10:01 +0200
Subject: [PATCH] More flags work

---
 lark/common.py       |  2 +-
 lark/lexer.py        | 15 +++++++++------
 lark/load_grammar.py |  7 +++++--
 tests/test_parser.py | 17 +++++++++++++++++
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index 93186e9..2c940bd 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -53,7 +53,7 @@ class Pattern(object):
         self.flags = flags
 
     def __repr__(self):
-        return repr(self.value)
+        return repr(self._get_flags() + self.value)
 
     # Pattern Hashing assumes all subclasses have a different priority!
     def __hash__(self):
diff --git a/lark/lexer.py b/lark/lexer.py
index 92decb9..053ce32 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -58,6 +58,7 @@ def _create_unless_callback(strs):
             if m:
                 value = m.group(0)
                 t.type = type_from_index[m.lastindex]
+                break
         return t
     return unless_callback
 
@@ -65,20 +66,22 @@ def _create_unless(tokens):
     tokens_by_type = classify(tokens, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
+    delayed_strs = []
     callback = {}
     for retok in tokens_by_type.get(PatternRE, []):
         unless = [] # {}
         for strtok in tokens_by_type.get(PatternStr, []):
             s = strtok.pattern.value
-            m = re.match(retok.pattern.value, s)
+            m = re.match(retok.pattern.to_regexp(), s)
             if m and m.group(0) == s:
+                if strtok.pattern.flags:
+                    delayed_strs.append(strtok)
                 embedded_strs.add(strtok.name)
-                #unless[s] = strtok.name
                 unless.append(strtok)
         if unless:
             callback[retok.name] = _create_unless_callback(unless)
 
-    tokens = [t for t in tokens if t.name not in embedded_strs]
+    tokens = [t for t in tokens if t.name not in embedded_strs] + delayed_strs
     return tokens, callback
 
 
@@ -90,7 +93,7 @@ def _build_mres(tokens, max_size, match_whole):
     mres = []
     while tokens:
         try:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()) for t in tokens[:max_size])+postfix)
+            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in tokens[:max_size]))
         except AssertionError:  # Yes, this is what Python provides us.. :/
             return _build_mres(tokens, max_size//2, match_whole)
 
@@ -130,11 +133,11 @@ class Lexer(object):
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = [t for t in ignore]
 
+        tokens.sort(key=lambda x:(x.pattern.priority, len(x.pattern.value)), reverse=True)
+
         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
 
-        tokens.sort(key=lambda x:(x.pattern.priority, len(x.pattern.value)), reverse=True)
-
         self.tokens = tokens
 
         self.mres = build_mres(tokens)
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 633d07e..9c8ef85 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -324,16 +324,19 @@ class TokenTreeToPattern(Transformer):
     def expansion(self, items):
         if len(items) == 1:
             return items[0]
-        return PatternRE(''.join(i.to_regexp() for i in items))
+        if len(set(i.flags for i in items)) > 1:
+            raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
+        return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags)
 
     def expansions(self, exps):
         if len(exps) == 1:
             return exps[0]
+        assert all(i.flags is None for i in exps)
         return PatternRE('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
 
     def expr(self, args):
         inner, op = args
-        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op))
+        return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
 
 
 def interleave(l, item):
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 00d6caf..83085e2 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -525,6 +525,23 @@ def _make_parser_test(LEXER, PARSER):
             """
             self.assertRaises(GrammarError, _Lark, g)
 
+            g = """start: NAME "," "a"
+                   NAME: /[a-z_]/i /[a-z0-9_]/i*
+                """
+            l = _Lark(g)
+            tree = l.parse('ab,a')
+            self.assertEqual(tree.children, ['ab'])
+            tree = l.parse('AB,a')
+            self.assertEqual(tree.children, ['AB'])
+
+        def test_token_flags2(self):
+            g = """!start: ("a"i | /a/ /b/?)+
+                """
+            l = _Lark(g)
+            tree = l.parse('aA')
+            self.assertEqual(tree.children, ['a', 'A'])
+
+
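A note for reviewers, not part of the patch: the user-visible effect of this change is that token flags such as `i` (case-insensitive) now survive repetition (`*`, `?`) and concatenation of token fragments. Below is a minimal sketch adapted from the new tests; the plain `Lark(...)` constructor with default options is an assumption about the API at this revision (the tests themselves go through the `_Lark` helper, which pins a lexer/parser combination):

```python
from lark import Lark

# "i" marks a token fragment as case-insensitive; the flag must be
# carried through the "*" repetition for NAME to match 'AB'.
parser = Lark('''start: NAME "," "a"
                 NAME: /[a-z_]/i /[a-z0-9_]/i*
              ''')

assert parser.parse('ab,a').children == ['ab']  # Token compares equal to str
assert parser.parse('AB,a').children == ['AB']  # accepted thanks to the i flag
```

Two implementation details worth noting: `expr` and `expansion` in `TokenTreeToPattern` now propagate `flags` into the composed `PatternRE` (raising `GrammarError` when fragments carry conflicting flags), and `_build_mres` moves the `$` postfix inside each named group because in a union such as `A|B$` a trailing anchor binds only to the last alternative.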