diff --git a/lark/lexer.py b/lark/lexer.py
index 3dabd95..6fb6572 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -9,7 +9,7 @@ class LexError(Exception):
     pass
 
 class UnexpectedInput(LexError):
-    def __init__(self, seq, lex_pos, line, column):
+    def __init__(self, seq, lex_pos, line, column, allowed=None):
         context = seq[lex_pos:lex_pos+5]
         message = "No token defined for: '%s' in %r at line %d" % (seq[lex_pos], context, line)
 
@@ -18,6 +18,7 @@ class UnexpectedInput(LexError):
         self.line = line
         self.column = column
         self.context = context
+        self.allowed = allowed
 
 class Token(Str):
     def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
@@ -238,7 +239,6 @@ class ContextualLexer:
                     break
             else:
                 if lex_pos < len(stream):
-                    print("Allowed tokens:", lexer.tokens)
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
+                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
                 break
 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 9bf6f95..8762176 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -290,6 +290,31 @@ class ExtractAnonTokens(InlineTransformer):
 def _rfind(s, choices):
     return max(s.rfind(c) for c in choices)
 
+
+
+def _fix_escaping(s):
+    s = s.replace('\\"', '"')
+    w = ''
+    i = iter(s)
+    for n in i:
+        w += n
+        if n == '\\':
+            n2 = next(i)
+            if n2 == '\\':
+                w += '\\\\'
+            elif n2 not in 'unftr':
+                w += '\\'
+            w += n2
+
+    to_eval = "u'''%s'''" % w
+    try:
+        s = literal_eval(to_eval)
+    except SyntaxError as e:
+        raise ValueError(s, e)
+
+    return s
+
+
 def _literal_to_pattern(literal):
     v = literal.value
     flag_start = _rfind(v, '/"')+1
@@ -300,13 +325,12 @@ def _literal_to_pattern(literal):
     v = v[:flag_start]
     assert v[0] == v[-1] and v[0] in '"/'
     x = v[1:-1]
-    x = re.sub(r'(\\[wd/ .]|\\\[|\\\])', r'\\\1', x)
-    x = x.replace("'", r"\'")
-    to_eval = "u'''%s'''" % x
-    try:
-        s = literal_eval(to_eval)
-    except SyntaxError as e:
-        raise ValueError(v, e)
+
+    s = _fix_escaping(x)
+
+    if v[0] == '"':
+        s = s.replace('\\\\', '\\')
+
     return { 'STRING': PatternStr,
              'REGEXP': PatternRE }[literal.type](s, flags or None)
 
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 33b9e5e..35c1a44 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -19,7 +19,7 @@ logging.basicConfig(level=logging.INFO)
 
 from lark.lark import Lark
 from lark.common import GrammarError, ParseError
-from lark.lexer import LexError
+from lark.lexer import LexError, UnexpectedInput
 from lark.tree import Tree, Transformer
 
 __path__ = os.path.dirname(__file__)
@@ -673,7 +673,7 @@ def _make_parser_test(LEXER, PARSER):
                       """)
             x = g.parse(r'\a')
 
-            g = _Lark(r"""start: /\\\\/ /a/
+            g = _Lark(r"""start: /\\/ /a/
                       """)
             x = g.parse(r'\a')
 
@@ -961,6 +961,49 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(tree.children, ['1'])
 
 
+        @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions")
+        def test_regex_escaping(self):
+            expected_error = ParseError if LEXER == 'dynamic' else UnexpectedInput
+            # TODO Make dynamic parser raise UnexpectedInput if nothing scans?
+
+            g = _Lark("start: /[ab]/")
+            g.parse('a')
+            g.parse('b')
+
+            self.assertRaises( expected_error, g.parse, 'c')
+
+            _Lark(r'start: /\w/').parse('a')
+
+            g = _Lark(r'start: /\\w/')
+            self.assertRaises( expected_error, g.parse, 'a')
+            g.parse(r'\w')
+
+            _Lark(r'start: /\[/').parse('[')
+
+            _Lark(r'start: /\//').parse('/')
+
+            _Lark(r'start: /\\/').parse('\\')
+
+            _Lark(r'start: /\[ab]/').parse('[ab]')
+
+            _Lark(r'start: /\\[ab]/').parse('\\a')
+
+            _Lark(r'start: /\t/').parse('\t')
+
+            _Lark(r'start: /\\t/').parse('\\t')
+
+            _Lark(r'start: /\\\t/').parse('\\\t')
+
+            _Lark(r'start: "\t"').parse('\t')
+
+            _Lark(r'start: "\\t"').parse('\\t')
+
+            _Lark(r'start: "\\\t"').parse('\\\t')
+
+
+
+
+
     _NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize()
     _TestParser.__name__ = _NAME
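
Note for reviewers (my own illustration, not part of the patch): the escaping semantics that `_fix_escaping` establishes can be sanity-checked against plain `re`, independently of lark. The `should_match` column mirrors the assertions in `test_regex_escaping` above.

    import re

    # (pattern, text, should_match) -- standalone sketch of the escaping
    # rules the patch enforces for /.../ regexp literals in grammars.
    cases = [
        (r'\w',  'a',   True),    # /\w/  keeps the regex word-class
        (r'\\w', '\\w', True),    # /\\w/ is a literal backslash + 'w'
        (r'\\w', 'a',   False),   # ...and must no longer match plain 'a'
        (r'\[',  '[',   True),    # /\[/  is a literal '['
        (r'\\',  '\\',  True),    # /\\/  is a single literal backslash
        (r'\t',  '\t',  True),    # /\t/  is a real tab character
    ]
    for pattern, text, should_match in cases:
        assert (re.match(pattern, text) is not None) == should_match, (pattern, text)

The lexer change has a similar aim: instead of printing the allowed tokens to stdout, they now travel on the exception, so callers can inspect them programmatically. A sketch, assuming `parser` is a Lark instance built with the contextual lexer:

    try:
        parser.parse('c')
    except UnexpectedInput as e:
        print(e.allowed)   # the token set the contextual lexer would have accepted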