From 34acc4674e4d8ff3bcd77b11ca507123ff334460 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Thu, 9 Mar 2017 18:15:55 +0200
Subject: [PATCH] Added flag options to terminals (just 'i' for now)

---
 lark/common.py       | 13 ++++++++++---
 lark/load_grammar.py | 27 ++++++++++++++++++---------
 tests/test_parser.py | 25 ++++++++++++++++++++++++-
 3 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index 9a19fd3..93186e9 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -48,8 +48,9 @@ class ParserConf:
 
 
 class Pattern(object):
-    def __init__(self, value):
+    def __init__(self, value, flags=None):
         self.value = value
+        self.flags = flags
 
     def __repr__(self):
         return repr(self.value)
@@ -60,15 +61,21 @@ class Pattern(object):
     def __eq__(self, other):
         return self.priority == other.priority and self.value == other.value
 
+    def _get_flags(self):
+        if self.flags:
+            assert len(self.flags) == 1
+            return '(?%s)' % self.flags
+        return ''
+
 class PatternStr(Pattern):
     def to_regexp(self):
-        return re.escape(self.value)
+        return self._get_flags() + re.escape(self.value)
 
     priority = 0
 
 class PatternRE(Pattern):
     def to_regexp(self):
-        return self.value
+        return self._get_flags() + self.value
 
     priority = 1
 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index ac947ec..633d07e 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -67,8 +67,8 @@ TOKENS = {
     '_DOT': r'\.',
     'RULE': '!?[_?]?[a-z][_a-z0-9]*',
     'TOKEN': '_?[A-Z][_A-Z0-9]*',
-    'STRING': r'"(\\"|\\\\|[^"\n])*?"',
-    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/',
+    'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
+    'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/i?',
     '_NL': r'(\r?\n)+\s*',
     'WS': r'[ \t]+',
     'COMMENT': r'//[^\n]*',
@@ -234,16 +234,18 @@ class ExtractAnonTokens(InlineTransformer):
     def __init__(self, tokens):
         self.tokens = tokens
         self.token_set = {td.name for td in self.tokens}
-        self.token_reverse = {td.pattern: td.name for td in tokens}
+        self.token_reverse = {td.pattern: td for td in tokens}
         self.i = 0
 
     def pattern(self, p):
         value = p.value
+        if p in self.token_reverse and p.flags != self.token_reverse[p].pattern.flags:
+            raise GrammarError(u'Conflicting flags for the same terminal: %s' % p)
         if isinstance(p, PatternStr):
            try:
                 # If already defined, use the user-defined token name
-                token_name = self.token_reverse[p]
+                token_name = self.token_reverse[p].name
             except KeyError:
                 # Try to assign an indicative anon-token name, otherwise use a numbered name
                 try:
@@ -263,8 +265,8 @@ class ExtractAnonTokens(InlineTransformer):
                 token_name = '__' + token_name
 
         elif isinstance(p, PatternRE):
-            if p in self.token_reverse: # Kind of a wierd placement
-                token_name = self.token_reverse[p]
+            if p in self.token_reverse: # Kind of a wierd placement.name
+                token_name = self.token_reverse[p].name
             else:
                 token_name = 'ANONRE_%d' % self.i
                 self.i += 1
@@ -274,19 +276,26 @@ class ExtractAnonTokens(InlineTransformer):
         if token_name not in self.token_set:
             assert p not in self.token_reverse
             self.token_set.add(token_name)
-            self.token_reverse[p] = token_name
-            self.tokens.append(TokenDef(token_name, p))
+            tokendef = TokenDef(token_name, p)
+            self.token_reverse[p] = tokendef
+            self.tokens.append(tokendef)
 
         return Token('TOKEN', token_name, -1)
 
 
 def _literal_to_pattern(literal):
     v = literal.value
+    if v[-1] in 'i':
+        flags = v[-1]
+        v = v[:-1]
+    else:
+        flags = None
+
     assert v[0] == v[-1] and v[0] in '"/'
     x = v[1:-1].replace("'", r"\'")
     s = literal_eval("u'''%s'''" % x)
     return { 'STRING': PatternStr,
-             'REGEXP': PatternRE }[literal.type](s)
+             'REGEXP': PatternRE }[literal.type](s, flags)
 
 
 class PrepareLiterals(InlineTransformer):
diff --git a/tests/test_parser.py b/tests/test_parser.py
index d3bf394..00d6caf 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -67,7 +67,6 @@ class TestEarley(unittest.TestCase):
         """, parser="earley", lexer=None)
         x = g.parse('aaaababc')
 
-
     def test_earley_scanless2(self):
         grammar = """
             start: statement+
@@ -505,6 +504,30 @@ def _make_parser_test(LEXER, PARSER):
             g.parse("+2e-9")
             self.assertRaises(ParseError, g.parse, "+2e-9e")
 
+        def test_token_flags(self):
+            l = _Lark("""!start: "a"i+
+                      """
+                      )
+            tree = l.parse('aA')
+            self.assertEqual(tree.children, ['a', 'A'])
+
+            l = _Lark("""!start: /a/i+
+                      """
+                      )
+            tree = l.parse('aA')
+            self.assertEqual(tree.children, ['a', 'A'])
+
+            g = """!start: "a"i "a"
+                """
+            self.assertRaises(GrammarError, _Lark, g)
+
+            g = """!start: /a/i /a/
+                """
+            self.assertRaises(GrammarError, _Lark, g)
+
+
+
     _NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize()
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser
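
Usage sketch (not part of the diff above): the snippet mirrors the new
test_token_flags case through the public Lark constructor, assuming the
top-level lark package exports Lark and that its default parser/lexer
settings behave like the configurations exercised by _make_parser_test.

    from lark import Lark

    # '!' keeps the matched tokens in the parse tree; the trailing 'i' makes
    # the anonymous "a" terminal case-insensitive, so 'a' and 'A' both match.
    parser = Lark('!start: "a"i+')
    tree = parser.parse('aA')
    assert tree.children == ['a', 'A']   # same assertion as test_token_flags

Mixing flagged and unflagged forms of the same literal (e.g. "a"i alongside
"a") raises GrammarError, as the other new test cases assert.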