| @@ -107,6 +107,7 @@ class TraditionalLexer(Lexer): | |||||
| user_callbacks: Dict[str, _Callback] | user_callbacks: Dict[str, _Callback] | ||||
| callback: Dict[str, _Callback] | callback: Dict[str, _Callback] | ||||
| mres: List[Tuple[REPattern, Dict[int, str]]] | mres: List[Tuple[REPattern, Dict[int, str]]] | ||||
| re: ModuleType | |||||
| def __init__( | def __init__( | ||||
| self, | self, | ||||
| @@ -247,13 +247,13 @@ def _create_unless(terminals, g_regex_flags, re_): | |||||
| if strtok.pattern.flags <= retok.pattern.flags: | if strtok.pattern.flags <= retok.pattern.flags: | ||||
| embedded_strs.add(strtok) | embedded_strs.add(strtok) | ||||
| if unless: | if unless: | ||||
| callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True)) | |||||
| callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True)) | |||||
| terminals = [t for t in terminals if t not in embedded_strs] | terminals = [t for t in terminals if t not in embedded_strs] | ||||
| return terminals, callback | return terminals, callback | ||||
| def _build_mres(terminals, max_size, g_regex_flags, match_whole): | |||||
| def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_): | |||||
| # Python sets an unreasonable group limit (currently 100) in its re module | # Python sets an unreasonable group limit (currently 100) in its re module | ||||
| # Worse, the only way to know we reached it is by catching an AssertionError! | # Worse, the only way to know we reached it is by catching an AssertionError! | ||||
| # This function recursively tries fewer and fewer groups until it's successful. | # This function recursively tries fewer and fewer groups until it's successful. | ||||
| @@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole): | |||||
| mres = [] | mres = [] | ||||
| while terminals: | while terminals: | ||||
| try: | try: | ||||
| mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) | |||||
| mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags) | |||||
| except AssertionError: # Yes, this is what Python provides us.. :/ | except AssertionError: # Yes, this is what Python provides us.. :/ | ||||
| return _build_mres(terminals, max_size//2, g_regex_flags, match_whole) | |||||
| return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_) | |||||
| # terms_from_name = {t.name: t for t in terminals[:max_size]} | # terms_from_name = {t.name: t for t in terminals[:max_size]} | ||||
| mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) | mres.append((mre, {i:n for n,i in mre.groupindex.items()} )) | ||||
| terminals = terminals[max_size:] | terminals = terminals[max_size:] | ||||
| return mres | return mres | ||||
| def build_mres(terminals, g_regex_flags, match_whole=False): | |||||
| return _build_mres(terminals, len(terminals), g_regex_flags, match_whole) | |||||
| def build_mres(terminals, g_regex_flags, re_, match_whole=False): | |||||
| return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_) | |||||
| def _regexp_has_newline(r): | def _regexp_has_newline(r): | ||||
| r"""Expressions that may indicate newlines in a regexp: | r"""Expressions that may indicate newlines in a regexp: | ||||
| @@ -332,7 +332,7 @@ class TraditionalLexer(Lexer): | |||||
| else: | else: | ||||
| self.callback[type_] = f | self.callback[type_] = f | ||||
| self.mres = build_mres(terminals, g_regex_flags) | |||||
| self.mres = build_mres(terminals, g_regex_flags, self.re) | |||||
| def match(self, stream, pos): | def match(self, stream, pos): | ||||
| for mre, type_from_index in self.mres: | for mre, type_from_index in self.mres: | ||||
| @@ -7,7 +7,7 @@ from .test_trees import TestTrees | |||||
| from .test_tools import TestStandalone | from .test_tools import TestStandalone | ||||
| from .test_cache import TestCache | from .test_cache import TestCache | ||||
| from .test_reconstructor import TestReconstructor | from .test_reconstructor import TestReconstructor | ||||
| from .test_regex import TestRegex | |||||
| try: | try: | ||||
| from .test_nearley.test_nearley import TestNearley | from .test_nearley.test_nearley import TestNearley | ||||
| except ImportError: | except ImportError: | ||||
| @@ -1787,6 +1787,23 @@ def _make_parser_test(LEXER, PARSER): | |||||
| self.assertEqual(a.line, 1) | self.assertEqual(a.line, 1) | ||||
| self.assertEqual(b.line, 2) | self.assertEqual(b.line, 2) | ||||
| @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') | |||||
| def test_unicode_class(self): | |||||
| "Tests that character classes from the `regex` module work correctly." | |||||
| g = _Lark(r"""?start: NAME | |||||
| NAME: ID_START ID_CONTINUE* | |||||
| ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ | |||||
| ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True) | |||||
| self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') | |||||
| @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') | |||||
| def test_unicode_word(self): | |||||
| "Tests that input affected by a persistent bug in the `re` module is matched correctly when `regex` is enabled." | |||||
| g = _Lark(r"""?start: NAME | |||||
| NAME: /[\w]+/ | |||||
| """, regex=True) | |||||
| self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') | |||||
| _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() | _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() | ||||
| _TestParser.__name__ = _NAME | _TestParser.__name__ = _NAME | ||||
| @@ -1,37 +0,0 @@ | |||||
| # -*- coding: utf-8 -*- | |||||
| from __future__ import absolute_import | |||||
| import logging | |||||
| import sys | |||||
| import unittest | |||||
| logging.basicConfig(level=logging.INFO) | |||||
| from lark.lark import Lark | |||||
| class TestRegex(unittest.TestCase): | |||||
| @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') | |||||
| def test_unicode_class(self): | |||||
| "Tests that character classes from the `regex` module work correctly." | |||||
| g = Lark(r""" | |||||
| ?start: NAME | |||||
| NAME: ID_START ID_CONTINUE* | |||||
| ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ | |||||
| ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ | |||||
| """, regex=True) | |||||
| self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') | |||||
| @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') | |||||
| def test_unicode_word(self): | |||||
| "Tests that a persistent bug in the `re` module works when `regex` is enabled." | |||||
| g = Lark(r""" | |||||
| ?start: NAME | |||||
| NAME: /[\w]+/ | |||||
| """, regex=True) | |||||
| self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') | |||||
| if __name__ == '__main__': | |||||
| unittest.main() | |||||
| @@ -14,11 +14,7 @@ pypy3 = pypy3 | |||||
| [testenv] | [testenv] | ||||
| whitelist_externals = git | whitelist_externals = git | ||||
| deps = | deps = | ||||
| -rnearley-requirements.txt | |||||
| -rregex-requirements.txt | |||||
| # For regex testing | |||||
| extras = regex | |||||
| -rtest-requirements.txt | |||||
| # to always force recreation and avoid unexpected side effects | # to always force recreation and avoid unexpected side effects | ||||
| recreate=True | recreate=True | ||||