Merge branch 'regex' of https://github.com/julienmalard/lark into julienmalard-regex

5 years ago · 1ef0e1832e
--- a/lark-stubs/lexer.pyi
+++ b/lark-stubs/lexer.pyi
@@ -107,6 +107,7 @@ class TraditionalLexer(Lexer):
    user_callbacks: Dict[str, _Callback]
    callback: Dict[str, _Callback]
    mres: List[Tuple[REPattern, Dict[int, str]]]
    re: ModuleType

    def __init__(
        self,
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -247,13 +247,13 @@ def _create_unless(terminals, g_regex_flags, re_):
                if strtok.pattern.flags <= retok.pattern.flags:
                    embedded_strs.add(strtok)
        if unless:
            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))

    terminals = [t for t in terminals if t not in embedded_strs]
    return terminals, callback


 def _build_mres(terminals, max_size, g_regex_flags, match_whole):
 def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries fewer and fewer groups until it's successful.
@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole):
    mres = []
    while terminals:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
            mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
        except AssertionError:  # Yes, this is what Python provides us.. :/
            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)

        # terms_from_name = {t.name: t for t in terminals[:max_size]}
        mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
        terminals = terminals[max_size:]
    return mres

 def build_mres(terminals, g_regex_flags, match_whole=False):
    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
 def build_mres(terminals, g_regex_flags, re_, match_whole=False):
    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)

 def _regexp_has_newline(r):
    r"""Expressions that may indicate newlines in a regexp:
@@ -332,7 +332,7 @@ class TraditionalLexer(Lexer):
            else:
                self.callback[type_] = f

        self.mres = build_mres(terminals, g_regex_flags)
        self.mres = build_mres(terminals, g_regex_flags, self.re)

    def match(self, stream, pos):
        for mre, type_from_index in self.mres:
--- a/tests/main.py
+++ b/tests/main.py
@@ -7,7 +7,7 @@ from .test_trees import TestTrees
 from .test_tools import TestStandalone
 from .test_cache import TestCache
 from .test_reconstructor import TestReconstructor
 from .test_regex import TestRegex

 try:
    from .test_nearley.test_nearley import TestNearley
 except ImportError:
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1787,6 +1787,23 @@ def _make_parser_test(LEXER, PARSER):
                self.assertEqual(a.line, 1)
                self.assertEqual(b.line, 2)

        @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
        def test_unicode_class(self):
            "Tests that character classes from the `regex` module work correctly."
            g = _Lark(r"""?start: NAME
                           NAME: ID_START ID_CONTINUE*
                           ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
                           ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)

            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

        @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
        def test_unicode_word(self):
            "Tests that input affected by a persistent bug in the `re` module parses correctly when `regex` is enabled."
            g = _Lark(r"""?start: NAME
                           NAME: /[\w]+/
                        """, regex=True)
            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

    _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
    _TestParser.__name__ = _NAME
--- a/tests/test_regex.py
+++ b/tests/test_regex.py
@@ -1,37 +0,0 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

 import logging
 import sys
 import unittest

 logging.basicConfig(level=logging.INFO)

 from lark.lark import Lark


 class TestRegex(unittest.TestCase):
    @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
    def test_unicode_class(self):
        "Tests that character classes from the `regex` module work correctly."
        g = Lark(r"""
                    ?start: NAME
                    NAME: ID_START ID_CONTINUE*
                    ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
                    ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
                """, regex=True)

        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

    @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
    def test_unicode_word(self):
        "Tests that input affected by a persistent bug in the `re` module parses correctly when `regex` is enabled."
        g = Lark(r"""
                    ?start: NAME
                    NAME: /[\w]+/
                """, regex=True)
        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')


 if __name__ == '__main__':
    unittest.main()
--- a/tox.ini
+++ b/tox.ini
@@ -14,11 +14,7 @@ pypy3 = pypy3
 [testenv]
 whitelist_externals = git
 deps =
    -rnearley-requirements.txt
    -rregex-requirements.txt

 # For regex testing
 extras = regex
    -rtest-requirements.txt

 # to always force recreation and avoid unexpected side effects
 recreate=True