From 9923987e94547ded8a17d7a03840c4cebce39188 Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Mon, 10 Aug 2020 23:07:55 +0300 Subject: [PATCH 1/3] allow multiline regexes with 'x' (verbose) flag --- lark/load_grammar.py | 13 ++++++++++--- tests/test_parser.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ae7ec32..d716ec1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -85,7 +85,7 @@ TERMINALS = { 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', - 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', @@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): term_name = None elif isinstance(p, PatternRE): - if p in self.term_reverse: # Kind of a wierd placement.name + if p in self.term_reverse: # Kind of a weird placement.name term_name = self.term_reverse[p].name else: assert False, p @@ -409,6 +409,13 @@ def _literal_to_pattern(literal): flags = v[flag_start:] assert all(f in _RE_FLAGS for f in flags), flags + if literal.type == 'STRING' and '\n' in v: + raise GrammarError('You cannot put newlines in string literals') + + if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: + raise GrammarError('You can only use newlines in regular expressions ' + 'with the `x` (verbose) flag') + v = v[:flag_start] assert v[0] == v[-1] and v[0] in '"/' x = v[1:-1] diff --git a/tests/test_parser.py b/tests/test_parser.py index cd3ea4d..48a4674 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER): tree = l.parse('aA') self.assertEqual(tree.children, ['a', 'A']) + def test_token_flags_verbose(self): + g = _Lark(r"""start: NL | ABC + ABC: / [a-z] /x + NL: /\n/ + """) + x = g.parse('a') + self.assertEqual(x.children, ['a']) + + def test_token_flags_verbose_multiline(self): + g = _Lark(r"""start: ABC + ABC: / a b c + d + e f + /x + """) + x = g.parse('abcdef') + self.assertEqual(x.children, ['abcdef']) + + def test_token_multiline_only_works_with_x_flag(self): + g = r"""start: ABC + ABC: / a b c + d + e f + /i + """ + self.assertRaises( GrammarError, _Lark, g) @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_twice_empty(self): From 8b59a1642533f1f577b104c7be33f0511193050d Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Tue, 11 Aug 2020 00:44:23 +0300 Subject: [PATCH 2/3] refactor: replace dict lookup with simple conditional --- lark/load_grammar.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index d716ec1..1a1a396 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -424,9 +424,11 @@ def _literal_to_pattern(literal): if literal.type == 'STRING': s = s.replace('\\\\', '\\') - - return { 'STRING': PatternStr, - 'REGEXP': PatternRE }[literal.type](s, flags) + return PatternStr(s, flags) + elif literal.type == 'REGEXP': + return PatternRE(s, flags) + else: + assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' @inline_args From 2525e0ce9c594b81a79caa5ff57c66a12a79ca5a Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Tue, 11 Aug 2020 00:46:54 +0300 Subject: [PATCH 3/3] formatting: fix pistol operator --- lark/load_grammar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 1a1a396..0ee546c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -850,7 +850,7 @@ class GrammarLoader: if len(stmt.children) > 1: path_node, arg1 = stmt.children else: - path_node, = stmt.children + path_node ,= stmt.children arg1 = None if isinstance(arg1, Tree): # Multi import