From 3fc97331881489d9320bd83eef25d66379d241dc Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Thu, 14 May 2020 14:36:55 -0400
Subject: [PATCH 01/12] Added regex module option.

---
 lark/lexer.py              | 5 ++++-
 lark/parser_frontends.py   | 5 ++++-
 setup.py                   | 5 ++++-
 tests/test_nearley/nearley | 2 +-
 tests/test_parser.py       | 5 ++++-
 5 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/lark/lexer.py b/lark/lexer.py
index 32bfe78..36541d1 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -1,6 +1,9 @@
 ## Lexer Implementation

-import re
+try:
+    import regex as re
+except ImportError:
+    import re

 from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index d68d186..9f80ed4 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,4 +1,7 @@
-import re
+try:
+    import regex as re
+except ImportError:
+    import re
 from functools import partial

 from .utils import get_regexp_width, Serialize
diff --git a/setup.py b/setup.py
index b962b7f..d31e4d2 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,7 @@
-import re
+try:
+    import regex as re
+except ImportError:
+    import re
 from setuptools import find_packages, setup

 __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read())
diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley
index a46b374..cf8925f 160000
--- a/tests/test_nearley/nearley
+++ b/tests/test_nearley/nearley
@@ -1 +1 @@
-Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44
+Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de
diff --git a/tests/test_parser.py b/tests/test_parser.py
index fcb6d22..c6f420e 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1,7 +1,10 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

-import re
+try:
+    import regex as re
+except ImportError:
+    import re
 import unittest
 import logging
 import os

From eeafdb954b2f4de71062bb44b06a6968e0921781 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 15 May 2020 17:11:23 -0400
Subject: [PATCH 02/12] Added preliminary tests.

---
 regex-requirements.txt |  1 +
 tests/test_regex.py    | 34 ++++++++++++++++++++++++++++++++++
 tox.ini                |  1 +
 3 files changed, 36 insertions(+)
 create mode 100644 regex-requirements.txt
 create mode 100644 tests/test_regex.py

diff --git a/regex-requirements.txt b/regex-requirements.txt
new file mode 100644
index 0000000..822e14a
--- /dev/null
+++ b/regex-requirements.txt
@@ -0,0 +1 @@
+regex
\ No newline at end of file
diff --git a/tests/test_regex.py b/tests/test_regex.py
new file mode 100644
index 0000000..db0bb85
--- /dev/null
+++ b/tests/test_regex.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+import logging
+import unittest
+
+logging.basicConfig(level=logging.INFO)
+
+from lark.lark import Lark
+
+
+class TestRegex(unittest.TestCase):
+    def test_unicode_class(self):
+        "Tests that character classes from the `regex` module work correctly."
+        g = Lark(r"""
+                    ?start: NAME
+                    NAME: ID_START ID_CONTINUE*
+                    ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                    ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
+                """)
+
+        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+
+    def test_unicode_word(self):
+        "Tests that a pattern affected by a persistent bug in the `re` module works when `regex` is enabled."
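+        # Editor's note (annotation, not in the original patch): stdlib `re`
+        # does not treat combining marks (category Mn, e.g. the Tamil pulli
+        # in 'வணக்கம்') as word characters, so `\w` only matches the whole
+        # string when the `regex` module is in use.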
+        g = Lark(r"""
+                    ?start: NAME
+                    NAME: /[\w]+/
+                """)
+        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tox.ini b/tox.ini
index f0f311e..5427f0f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -15,6 +15,7 @@ pypy3 = pypy3
 whitelist_externals = git
 deps =
     -rnearley-requirements.txt
+    -rregex-requirements.txt

 # to always force recreation and avoid unexpected side effects
 recreate=True

From 382489e020975f2d12b5f636ab6d76cb248d0cd1 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Wed, 10 Jun 2020 09:53:24 -0400
Subject: [PATCH 03/12] All tests pass now (local testing)

---
 lark/utils.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/lark/utils.py b/lark/utils.py
index 199071c..5ed662b 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -165,14 +165,29 @@ def smart_decorator(f, create_decorator):
     else:
         return create_decorator(f.__func__.__call__, True)

+try:
+    import regex
+except ImportError:
+    regex = None
+
 import sys, re
 Py36 = (sys.version_info[:2] >= (3, 6))

 import sre_parse
 import sre_constants
+categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
 def get_regexp_width(regexp):
+    if regex:
+        # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
+        # a simple letter, which makes no difference as we are only trying to get the possible lengths of the
+        # regex match below.
+        regexp_final = re.sub(categ_pattern, 'A', regexp)
+    else:
+        if re.search(categ_pattern, regexp):
+            raise ImportError('`regex` module must be installed in order to use Unicode categories.', regexp)
+        regexp_final = regexp
     try:
-        return [int(x) for x in sre_parse.parse(regexp).getwidth()]
+        return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
     except sre_constants.error:
         raise ValueError(regexp)

@@ -182,7 +197,7 @@ def get_regexp_width(regexp):

 def dedup_list(l):
     """Given a list (l), will remove duplicates from the list, preserving the original order of the list. Assumes that
-    the list entrie are hashable."""
+    the list entries are hashable."""
     dedup = set()
     return [ x for x in l if not (x in dedup or dedup.add(x))]

From 86a162d6d82522ab9f008b693e5418443f428ef5 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 10:52:42 -0400
Subject: [PATCH 04/12] Added `regex` module as optional mode.
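
[Editor's note] The user-facing surface of this patch is the new `regex` flag on the `Lark` constructor. A minimal usage sketch, assuming the third-party `regex` package is installed; the grammar below is illustrative, not taken from the diff:

```python
from lark import Lark

# Opt in to the `regex` module. If it is not importable,
# Lark(regex=True) raises ImportError (see the lark/lark.py hunk below).
parser = Lark(r"""
    ?start: NAME
    NAME: /\w+/
""", regex=True)

print(parser.parse("hello"))  # a Token equal to the string 'hello'
```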
---
 lark-stubs/lark.pyi      |  2 ++
 lark/lark.py             | 25 ++++++++++++++++++++++---
 lark/lexer.py            | 25 ++++++++++++++-----------
 lark/load_grammar.py     | 14 ++++++++------
 lark/parser_frontends.py | 38 ++++++++++++++++++++------------------
 tests/test_parser.py     |  4 ++--
 tests/test_regex.py      |  4 ++--
 7 files changed, 70 insertions(+), 42 deletions(-)

diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index 8e5e3dd..511e0ad 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -23,6 +23,7 @@ class LarkOptions:
     transformer: Optional[Transformer]
     postlex: Optional[PostLex]
     ambiguity: str
+    regex: bool
     debug: bool
     keep_all_tokens: bool
     propagate_positions: bool
@@ -48,6 +49,7 @@ class Lark:
         transformer: Optional[Transformer] = None,
         postlex: Optional[PostLex] = None,
         ambiguity: Literal["explicit", "resolve"] = "resolve",
+        regex: bool = False,
         debug: bool = False,
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,
diff --git a/lark/lark.py b/lark/lark.py
index 4497dd1..2c9dd42 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 from .grammar import Rule

+import re
+try:
+    import regex
+except ImportError:
+    regex = None
+
 ###{standalone

 class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
             When `False`, `[]` behaves like the `?` operator, and returns no value at all.
             (default=`False`. Recommended to set to `True`)

+        regex - When True, uses the `regex` module instead of the stdlib `re`.
         cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
                 LALR only for now.
             When `False`, does nothing (default)
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
         'start': 'start',
         'priority': 'auto',
         'ambiguity': 'auto',
+        'regex': False,
         'propagate_positions': False,
         'lexer_callbacks': {},
         'maybe_placeholders': False,
@@ -154,6 +162,16 @@ class Lark(Serialize):

         self.options = LarkOptions(options)

+        # Set regex or re module
+        use_regex = self.options.regex
+        if use_regex:
+            if regex:
+                self.re = regex
+            else:
+                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
+        else:
+            self.re = re
+
         # Some, but not all file-like objects have a 'name' attribute
         try:
             self.source = grammar.name
@@ -224,7 +242,7 @@ class Lark(Serialize):
             assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )

         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source)
+        self.grammar = load_grammar(grammar, self.source, self.re)

         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -285,7 +303,7 @@ class Lark(Serialize):
     def _build_parser(self):
         self._prepare_callbacks()
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
-        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)

     def save(self, f):
         data, m = self.memo_serialize([TerminalDef, Rule])
@@ -312,10 +330,11 @@ class Lark(Serialize):
         if postlex is not None:
             options['postlex'] = postlex
         self.options = LarkOptions.deserialize(options, memo)
+        self.re = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source = ''
         self._prepare_callbacks()
-        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
+        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
         return self

     @classmethod
diff --git a/lark/lexer.py b/lark/lexer.py
index 36541d1..4d5c498 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -1,9 +1,10 @@
 ## Lexer Implementation

+import re
 try:
-    import regex as re
+    import regex
 except ImportError:
-    import re
+    regex = None

 from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
@@ -233,7 +234,7 @@ class CallChain:



-def _create_unless(terminals, g_regex_flags):
+def _create_unless(terminals, g_regex_flags, re_):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -244,7 +245,7 @@ def _create_unless(terminals, g_regex_flags, re_):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
+            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
@@ -297,16 +298,17 @@ class Lexer(object):

 class TraditionalLexer(Lexer):

-    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

         terminals = list(terminals)

+        self.re = re_
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp(), g_regex_flags)
-            except re.error:
+                self.re.compile(t.pattern.to_regexp(), g_regex_flags)
+            except self.re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

             if t.pattern.min_width == 0:
@@ -324,7 +326,7 @@ class TraditionalLexer(Lexer):
         self.build(g_regex_flags)

     def build(self, g_regex_flags=0):
-        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
         assert all(self.callback.values())

         for type_, f in self.user_callbacks.items():
@@ -350,7 +352,8 @@ class TraditionalLexer(Lexer):

 class ContextualLexer(Lexer):

-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+        self.re = re_
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
@@ -365,12 +368,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+                lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+        self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)

     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index a4bef03..407d8d1 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -616,7 +616,7 @@ class Grammar:
 _imported_grammars = {}
-def import_grammar(grammar_path, base_paths=[]):
+def import_grammar(grammar_path, re_, base_paths=[]):
     if grammar_path not in _imported_grammars:
         import_paths = base_paths + IMPORT_PATHS
         for import_path in import_paths:
@@ -624,7 +624,7 @@ def import_grammar(grammar_path, re_, base_paths=[]):
             joined_path = os.path.join(import_path, grammar_path)
             with open(joined_path, encoding='utf8') as f:
                 text = f.read()
-            grammar = load_grammar(text, joined_path)
+            grammar = load_grammar(text, joined_path, re_)
             _imported_grammars[grammar_path] = grammar
             break
     else:
@@ -755,7 +756,8 @@ def _find_used_symbols(tree):
             for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

 class GrammarLoader:
-    def __init__(self):
+    def __init__(self, re_):
+        self.re = re_
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -764,7 +765,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)

         self.canonize_tree = CanonizeTree()
@@ -862,7 +863,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, base_paths=base_paths)
+            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

             term_defs += new_td
@@ -942,4 +943,5 @@ class GrammarLoader:



-load_grammar = GrammarLoader().load_grammar
+def load_grammar(grammar, source, re_):
+    return GrammarLoader(re_).load_grammar(grammar, source)
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 9f80ed4..c453ab6 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,7 +1,3 @@
-try:
-    import regex as re
-except ImportError:
-    import re
 from functools import partial

 from .utils import get_regexp_width, Serialize
@@ -66,14 +62,16 @@ class WithLexer(_ParserFrontend):
     __serialize_fields__ = 'parser', 'lexer_conf', 'start'
     __serialize_namespace__ = LexerConf,

-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         self.lexer_conf = lexer_conf
         self.start = parser_conf.start
         self.postlex = lexer_conf.postlex
+        self.re = re_

     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex):
+    def deserialize(cls, data, memo, callbacks, postlex, re_):
         inst = super(WithLexer, cls).deserialize(data, memo)
+        inst.re = re_
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
         inst.init_lexer()
@@ -91,13 +89,14 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)

     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

 class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         debug = options.debug if options else False
+        self.re = re_
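+        # Editor's note (annotation, not in the original patch): `re_` is
+        # stored on the frontend so that `init_lexer()` can hand the chosen
+        # module (stdlib `re` or third-party `regex`) down to
+        # TraditionalLexer/ContextualLexer, keeping all pattern compilation
+        # on one engine.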
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)

         self.init_lexer()
@@ -113,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     re_=self.re,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
                                      user_callbacks=self.lexer_conf.callbacks,
@@ -129,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}

 class LALR_CustomLexer(LALR_WithLexer):
-    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.lexer = lexer_cls(lexer_conf)
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
+        self.lexer = lexer_cls(lexer_conf, re_=re_)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)


 def tokenize_text(text):
     line = 1
     col_start_pos = 0
     for i, ch in enumerate(text):
         if '\n' in ch:
             line += ch.count('\n')
             col_start_pos = i + ch.rindex('\n')
         yield Token('CHAR', ch, line=line, column=i - col_start_pos)

 class Earley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()

         resolve_ambiguity = options.ambiguity == 'resolve'


 class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
+        self.re = re_
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
         self.start = parser_conf.start

             if width == 0:
                 raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

-            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
+            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)

     def parse(self, text, start):
         return self._parse(text, start)


 class XEarley_CompleteLex(XEarley):


 class CYK(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()

         self._analysis = GrammarAnalyzer(parser_conf)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index c6f420e..f8f37df 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -551,8 +551,8 @@ class CustomLexer(Lexer):
     Purpose of this custom lexer is to test the integration,
     so it uses the traditional parser as implementation without custom lexing behaviour.
""" - def __init__(self, lexer_conf): - self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags) + def __init__(self, lexer_conf, re_): + self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags) def lex(self, *args, **kwargs): return self.lexer.lex(*args, **kwargs) diff --git a/tests/test_regex.py b/tests/test_regex.py index db0bb85..6932a6b 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -17,7 +17,7 @@ class TestRegex(unittest.TestCase): NAME: ID_START ID_CONTINUE* ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ - """) + """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') @@ -26,7 +26,7 @@ class TestRegex(unittest.TestCase): g = Lark(r""" ?start: NAME NAME: /[\w]+/ - """) + """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') From 857f71e3aaade4e9fed8f87e728dada22e1ef060 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:12:05 -0400 Subject: [PATCH 05/12] Added regex tests to tox. --- tests/__main__.py | 2 +- tests/test_regex.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/__main__.py b/tests/__main__.py index cb26eb4..6b8f513 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -7,7 +7,7 @@ from .test_trees import TestTrees from .test_tools import TestStandalone from .test_cache import TestCache from .test_reconstructor import TestReconstructor - +from .test_regex import TestRegex try: from .test_nearley.test_nearley import TestNearley except ImportError: diff --git a/tests/test_regex.py b/tests/test_regex.py index 6932a6b..19f1923 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -2,6 +2,7 @@ from __future__ import absolute_import import logging +import sys import unittest logging.basicConfig(level=logging.INFO) @@ -10,8 +11,10 @@ from lark.lark import Lark class TestRegex(unittest.TestCase): + @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." + print(sys.version_info) g = Lark(r""" ?start: NAME NAME: ID_START ID_CONTINUE* @@ -21,6 +24,7 @@ class TestRegex(unittest.TestCase): self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') + @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_word(self): "Tests that a persistent bug in the `re` module works when `regex` is enabled." g = Lark(r""" From 797195d8ad212e62a6c51fe5d767afdeeefa3ae9 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:21:35 -0400 Subject: [PATCH 06/12] Removed debug print --- tests/test_regex.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_regex.py b/tests/test_regex.py index 19f1923..d20a8bf 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -14,7 +14,6 @@ class TestRegex(unittest.TestCase): @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." 
-        print(sys.version_info)
         g = Lark(r"""
                     ?start: NAME
                     NAME: ID_START ID_CONTINUE*

From 1465ac73537d5f42d4d977d5a8c5c91b9b9d51bc Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:21:51 -0400
Subject: [PATCH 07/12] Added `regex` extras dependency

---
 tox.ini | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tox.ini b/tox.ini
index 5427f0f..ee0c5dd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -17,6 +17,9 @@ deps =
     -rnearley-requirements.txt
     -rregex-requirements.txt

+# For regex testing
+extras = regex
+
 # to always force recreation and avoid unexpected side effects
 recreate=True

From 959d05ad36a24186daa0cde887ea4325eff72d0a Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:27:43 -0400
Subject: [PATCH 08/12] Try with extras_require

---
 setup.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/setup.py b/setup.py
index d31e4d2..a3d2a97 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,10 @@ setup(
     requires = [],
     install_requires = [],

+    extras_require = {
+        "regex": ["regex"]
+    },
+
     package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},

     test_suite = 'tests.__main__',

From a163b344b3a8868c1eb0819faa12bda6ec7eb7c2 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:32:23 -0400
Subject: [PATCH 09/12] Found it!

---
 .github/workflows/tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7be3a92..f55b88c 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -23,6 +23,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install -r nearley-requirements.txt
+          pip install -r regex-requirements.txt
       - name: Run tests
         run: |
           python -m tests
\ No newline at end of file

From 5fe67b9fc4c8302534bef499a6ddc6b7c3344eac Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:35:46 -0400
Subject: [PATCH 10/12] Merged test requirements

---
 .github/workflows/tests.yml                       | 3 +--
 regex-requirements.txt                            | 1 -
 nearley-requirements.txt => test-requirements.txt | 1 +
 3 files changed, 2 insertions(+), 3 deletions(-)
 delete mode 100644 regex-requirements.txt
 rename nearley-requirements.txt => test-requirements.txt (70%)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f55b88c..6d1e406 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -22,8 +22,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r nearley-requirements.txt
-          pip install -r regex-requirements.txt
+          pip install -r test-requirements.txt
       - name: Run tests
         run: |
           python -m tests
\ No newline at end of file
diff --git a/regex-requirements.txt b/regex-requirements.txt
deleted file mode 100644
index 822e14a..0000000
--- a/regex-requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-regex
\ No newline at end of file
diff --git a/nearley-requirements.txt b/test-requirements.txt
similarity index 70%
rename from nearley-requirements.txt
rename to test-requirements.txt
index 750c740..d304ee8 100644
--- a/nearley-requirements.txt
+++ b/test-requirements.txt
@@ -1 +1,2 @@
 Js2Py==0.68
+regex
\ No newline at end of file

From e22536fc9b70e1ec6a875f20754331826c3197fd Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:40:18 -0400
Subject: [PATCH 11/12] Updated stubs

---
 lark-stubs/lexer.pyi | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi
index a43b754..1ae861d 100644
--- a/lark-stubs/lexer.pyi
+++ b/lark-stubs/lexer.pyi
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-
+from types import ModuleType
 from typing import (
     TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, Pattern as REPattern,
 )
@@ -111,6 +111,7 @@ class TraditionalLexer(Lexer):
     def __init__(
         self,
         terminals: Collection[TerminalDef],
+        re_: ModuleType,
         ignore: Collection[str] = ...,
         user_callbacks: Dict[str, _Callback] = ...,
         g_regex_flags: int = ...
@@ -135,6 +136,7 @@ class ContextualLexer(Lexer):
         self,
         terminals: Collection[TerminalDef],
         states: Dict[str, Collection[str]],
+        re_: ModuleType,
         ignore: Collection[str] = ...,
         always_accept: Collection[str] = ...,
         user_callbacks: Dict[str, _Callback] = ...,

From c319ace48d1b0edea506a5364fd04816480e84a7 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:47:00 -0400
Subject: [PATCH 12/12] Update README.md

---
 README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/README.md b/README.md
index 1c7062c..02b89d7 100644
--- a/README.md
+++ b/README.md
@@ -176,6 +176,27 @@ You can use the output as a regular python module:
 0.38981434460254655
 ```

+### Using Unicode character classes with `regex`
+Python's built-in `re` module has a few persistent known bugs and also does not support
+Unicode character classes such as `\p{Lu}`.
+With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark`
+and can act as a drop-in replacement to `re`.
+
+Any `Lark` instance created with `regex=True` will use the `regex` module instead of `re`.
+For example, we can now use character classes to match PEP-3131 compliant Python identifiers.
+```python
+>>> from lark import Lark
+>>> g = Lark(r"""
+                    ?start: NAME
+                    NAME: ID_START ID_CONTINUE*
+                    ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                    ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
+                """, regex=True)
+
+>>> g.parse('வணக்கம்')
+'வணக்கம்'
+
+```
 ## License
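
[Editor's note] The width-computation workaround from patch 03 is worth seeing in isolation. The following self-contained sketch (simplified from the `lark/utils.py` hunk above; the no-`regex` fallback branch and error handling are omitted) shows why substituting a plain letter is safe: a `\p{...}` escape always matches exactly one character, so the pattern's minimum and maximum widths are unchanged:

```python
import re
import sre_parse

# Matches Unicode-property escapes such as \p{Lu} or \p{Mn} in a pattern string.
categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')

def get_regexp_width(regexp):
    # Swap each one-character \p{...} escape for the literal 'A' so that
    # sre_parse, which rejects Unicode properties, can still compute widths.
    stripped = categ_pattern.sub('A', regexp)
    return [int(x) for x in sre_parse.parse(stripped).getwidth()]

print(get_regexp_width(r'[\p{Lu}\p{Ll}\p{Nl}_]+'))  # [1, 4294967295], i.e. unbounded
```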