@@ -107,6 +107,7 @@ class TraditionalLexer(Lexer): | |||||
user_callbacks: Dict[str, _Callback] | user_callbacks: Dict[str, _Callback] | ||||
callback: Dict[str, _Callback] | callback: Dict[str, _Callback] | ||||
mres: List[Tuple[REPattern, Dict[int, str]]] | mres: List[Tuple[REPattern, Dict[int, str]]] | ||||
re: ModuleType | |||||
def __init__( | def __init__( | ||||
self, | self, | ||||
@@ -251,13 +251,13 @@ def _create_unless(terminals, g_regex_flags, re_): | |||||
if strtok.pattern.flags <= retok.pattern.flags: | if strtok.pattern.flags <= retok.pattern.flags: | ||||
embedded_strs.add(strtok) | embedded_strs.add(strtok) | ||||
if unless: | if unless: | ||||
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True)) | |||||
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True)) | |||||
terminals = [t for t in terminals if t not in embedded_strs] | terminals = [t for t in terminals if t not in embedded_strs] | ||||
return terminals, callback | return terminals, callback | ||||
def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
    """Compile ``terminals`` into a list of alternation patterns.

    Terminals are taken ``max_size`` at a time; each chunk becomes a single
    alternation of named groups, one group per terminal.

    Python's ``re`` module has enforced a limit on the number of groups per
    pattern (100 at the time the original comment was written), and the only
    way to detect that the limit was hit is to catch an ``AssertionError``.
    When that happens, the chunk size is halved and the terminals that are
    still pending are retried, until compilation succeeds.

    Args:
        terminals: sequence of objects exposing ``name`` and
            ``pattern.to_regexp()``.
        max_size: maximum number of terminals per compiled pattern.
        g_regex_flags: flags forwarded to every ``compile`` call.
        match_whole: when true, a postfix is appended to every alternative
            (see the review note in the body).
        re_: module-like object providing ``compile``.

    Returns:
        A list of ``(compiled_pattern, {group_index: group_name})`` tuples,
        in the same order as ``terminals``.

    Raises:
        AssertionError: if a chunk of a single terminal still cannot be
            compiled (halving further could never succeed).
    """
    # NOTE(review): the line defining `postfix` is elided at the hunk boundary
    # of the visible diff; reconstructed from the `match_whole` parameter and
    # the use of `postfix` below -- confirm against the full file.
    postfix = '$' if match_whole else ''

    mres = []
    while terminals:
        pattern = u'|'.join(
            u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix)
            for t in terminals[:max_size]
        )
        try:
            mre = re_.compile(pattern, g_regex_flags)
        except AssertionError:  # Yes, this is what Python provides us.. :/
            if max_size <= 1:
                # A single terminal already fails; a chunk size of 0 would
                # compile an empty pattern forever without making progress.
                raise
            # Keep the patterns compiled so far and retry only the pending
            # terminals with a smaller chunk size.
            return mres + _build_mres(terminals, max_size // 2, g_regex_flags, match_whole, re_)

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        terminals = terminals[max_size:]
    return mres
def build_mres(terminals, g_regex_flags, re_, match_whole=False):
    """Compile ``terminals`` into match patterns using the ``re_`` module.

    Delegates to ``_build_mres``, starting with a chunk size equal to the
    total number of terminals.

    Args:
        terminals: the terminals to compile.
        g_regex_flags: flags forwarded to the compile step.
        re_: module-like object forwarded to ``_build_mres``.
        match_whole: forwarded unchanged to ``_build_mres``.

    Returns:
        Whatever ``_build_mres`` returns for these arguments.
    """
    initial_chunk_size = len(terminals)
    return _build_mres(terminals, initial_chunk_size, g_regex_flags, match_whole, re_)
def _regexp_has_newline(r): | def _regexp_has_newline(r): | ||||
r"""Expressions that may indicate newlines in a regexp: | r"""Expressions that may indicate newlines in a regexp: | ||||
@@ -336,7 +336,7 @@ class TraditionalLexer(Lexer): | |||||
else: | else: | ||||
self.callback[type_] = f | self.callback[type_] = f | ||||
self.mres = build_mres(terminals, g_regex_flags) | |||||
self.mres = build_mres(terminals, g_regex_flags, self.re) | |||||
def match(self, stream, pos): | def match(self, stream, pos): | ||||
for mre, type_from_index in self.mres: | for mre, type_from_index in self.mres: | ||||
@@ -7,7 +7,7 @@ from .test_trees import TestTrees | |||||
from .test_tools import TestStandalone | from .test_tools import TestStandalone | ||||
from .test_cache import TestCache | from .test_cache import TestCache | ||||
from .test_reconstructor import TestReconstructor | from .test_reconstructor import TestReconstructor | ||||
from .test_regex import TestRegex | |||||
try: | try: | ||||
from .test_nearley.test_nearley import TestNearley | from .test_nearley.test_nearley import TestNearley | ||||
except ImportError: | except ImportError: | ||||
@@ -1787,6 +1787,26 @@ def _make_parser_test(LEXER, PARSER): | |||||
self.assertEqual(a.line, 1) | self.assertEqual(a.line, 1) | ||||
self.assertEqual(b.line, 2) | self.assertEqual(b.line, 2) | ||||
@unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
def test_unicode_class(self):
    """Check that character classes from the `regex` module work correctly.

    The grammar builds NAME from Unicode property classes and parses a
    non-Latin word with it.
    """
    # NOTE(review): `regex=True` presumably needs the optional third-party
    # `regex` package; skip instead of erroring when it is not installed.
    # Local import: this body only runs on Python 3 (see the decorator).
    import importlib.util
    if importlib.util.find_spec('regex') is None:
        self.skipTest('The `regex` module is not installed.')

    g = _Lark(r"""
        ?start: NAME
        NAME: ID_START ID_CONTINUE*
        ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
        ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
        """, regex=True)

    self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
@unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
def test_unicode_word(self):
    r"""Check that a `\w`-based terminal parses a non-Latin word with `regex` enabled.

    Per the original note, this guards against a persistent bug in the
    `re` module that enabling `regex` is expected to avoid.
    """
    # NOTE(review): `regex=True` presumably needs the optional third-party
    # `regex` package; skip instead of erroring when it is not installed.
    # Local import: this body only runs on Python 3 (see the decorator).
    import importlib.util
    if importlib.util.find_spec('regex') is None:
        self.skipTest('The `regex` module is not installed.')

    g = _Lark(r"""
        ?start: NAME
        NAME: /[\w]+/
        """, regex=True)
    self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() | _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() | ||||
_TestParser.__name__ = _NAME | _TestParser.__name__ = _NAME | ||||
@@ -1,37 +0,0 @@ | |||||
# -*- coding: utf-8 -*- | |||||
from __future__ import absolute_import | |||||
import logging | |||||
import sys | |||||
import unittest | |||||
logging.basicConfig(level=logging.INFO) | |||||
from lark.lark import Lark | |||||
class TestRegex(unittest.TestCase):
    """Tests for grammars compiled with ``regex=True``."""

    def setUp(self):
        # NOTE(review): `regex=True` presumably needs the optional third-party
        # `regex` package; skip instead of erroring when it is not installed.
        # setUp is not run for tests already skipped by their decorator, so
        # this import only executes on Python 3.
        import importlib.util
        if importlib.util.find_spec('regex') is None:
            self.skipTest('The `regex` module is not installed.')

    @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
    def test_unicode_class(self):
        """Check that character classes from the `regex` module work correctly."""
        g = Lark(r"""
            ?start: NAME
            NAME: ID_START ID_CONTINUE*
            ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
            ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
            """, regex=True)

        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

    @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
    def test_unicode_word(self):
        r"""Check that a `\w`-based terminal parses a non-Latin word with `regex` enabled.

        Per the original note, this guards against a persistent bug in the
        `re` module that enabling `regex` is expected to avoid.
        """
        g = Lark(r"""
            ?start: NAME
            NAME: /[\w]+/
            """, regex=True)
        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
if __name__ == '__main__': | |||||
unittest.main() |