From 3fc97331881489d9320bd83eef25d66379d241dc Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Thu, 14 May 2020 14:36:55 -0400
Subject: [PATCH 01/12] Added regex module option.

---
 lark/lexer.py              | 5 ++++-
 lark/parser_frontends.py   | 5 ++++-
 setup.py                   | 5 ++++-
 tests/test_nearley/nearley | 2 +-
 tests/test_parser.py       | 5 ++++-
 5 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/lark/lexer.py b/lark/lexer.py
index 32bfe78..36541d1 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -1,6 +1,9 @@
 ## Lexer Implementation

-import re
+try:
+    import regex as re
+except ImportError:
+    import re

 from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index d68d186..9f80ed4 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,4 +1,7 @@
-import re
+try:
+    import regex as re
+except ImportError:
+    import re
 from functools import partial

 from .utils import get_regexp_width, Serialize
diff --git a/setup.py b/setup.py
index b962b7f..d31e4d2 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,7 @@
-import re
+try:
+    import regex as re
+except ImportError:
+    import re
 from setuptools import find_packages, setup

 __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read())
diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley
index a46b374..cf8925f 160000
--- a/tests/test_nearley/nearley
+++ b/tests/test_nearley/nearley
@@ -1 +1 @@
-Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44
+Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de
diff --git a/tests/test_parser.py b/tests/test_parser.py
index fcb6d22..c6f420e 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1,7 +1,10 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

-import re
+try:
+    import regex as re
+except ImportError:
+    import re
 import unittest
 import logging
 import os

From eeafdb954b2f4de71062bb44b06a6968e0921781 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 15 May 2020 17:11:23 -0400
Subject: [PATCH 02/12] Added preliminary tests.

---
 regex-requirements.txt |  1 +
 tests/test_regex.py    | 34 ++++++++++++++++++++++++++++++++++
 tox.ini                |  1 +
 3 files changed, 36 insertions(+)
 create mode 100644 regex-requirements.txt
 create mode 100644 tests/test_regex.py

diff --git a/regex-requirements.txt b/regex-requirements.txt
new file mode 100644
index 0000000..822e14a
--- /dev/null
+++ b/regex-requirements.txt
@@ -0,0 +1 @@
+regex
\ No newline at end of file
diff --git a/tests/test_regex.py b/tests/test_regex.py
new file mode 100644
index 0000000..db0bb85
--- /dev/null
+++ b/tests/test_regex.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+import logging
+import unittest
+
+logging.basicConfig(level=logging.INFO)
+
+from lark.lark import Lark
+
+
+class TestRegex(unittest.TestCase):
+    def test_unicode_class(self):
+        "Tests that character classes from the `regex` module work correctly."
+        g = Lark(r"""
+                    ?start: NAME
+                    NAME: ID_START ID_CONTINUE*
+                    ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                    ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
+                """)
+
+        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+
+    def test_unicode_word(self):
+        "Tests that a pattern affected by a persistent bug in the `re` module works when `regex` is enabled."
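+        # Editor's note (annotation, not in the original patch): stdlib `re`
+        # does not treat combining marks (category Mn, e.g. the Tamil pulli
+        # in 'வணக்கம்') as word characters, so `\w` only matches the whole
+        # string when the `regex` module is in use.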
+        g = Lark(r"""
+                    ?start: NAME
+                    NAME: /[\w]+/
+                """)
+        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tox.ini b/tox.ini
index f0f311e..5427f0f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -15,6 +15,7 @@ pypy3 = pypy3
 whitelist_externals = git
 deps =
     -rnearley-requirements.txt
+    -rregex-requirements.txt

 # to always force recreation and avoid unexpected side effects
 recreate=True

From 382489e020975f2d12b5f636ab6d76cb248d0cd1 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Wed, 10 Jun 2020 09:53:24 -0400
Subject: [PATCH 03/12] All tests pass now (local testing)

---
 lark/utils.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/lark/utils.py b/lark/utils.py
index 199071c..5ed662b 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -165,14 +165,29 @@ def smart_decorator(f, create_decorator):
     else:
         return create_decorator(f.__func__.__call__, True)

+try:
+    import regex
+except ImportError:
+    regex = None
+
 import sys, re
 Py36 = (sys.version_info[:2] >= (3, 6))

 import sre_parse
 import sre_constants
+categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
 def get_regexp_width(regexp):
+    if regex:
+        # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
+        # a simple letter, which makes no difference as we are only trying to get the possible lengths of the
+        # regex match below.
+        regexp_final = re.sub(categ_pattern, 'A', regexp)
+    else:
+        if re.search(categ_pattern, regexp):
+            raise ImportError('`regex` module must be installed in order to use Unicode categories.', regexp)
+        regexp_final = regexp
     try:
-        return [int(x) for x in sre_parse.parse(regexp).getwidth()]
+        return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
     except sre_constants.error:
         raise ValueError(regexp)

@@ -182,7 +197,7 @@ def get_regexp_width(regexp):

 def dedup_list(l):
     """Given a list (l), will remove duplicates from the list, preserving the original order of the list. Assumes that
-    the list entrie are hashable."""
+    the list entries are hashable."""
     dedup = set()
     return [ x for x in l if not (x in dedup or dedup.add(x))]

From 86a162d6d82522ab9f008b693e5418443f428ef5 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 10:52:42 -0400
Subject: [PATCH 04/12] Added `regex` module as optional mode.
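
[Editor's note] The user-facing surface of this patch is the new `regex` flag on the `Lark` constructor. A minimal usage sketch, assuming the third-party `regex` package is installed; the grammar below is illustrative, not taken from the diff:

```python
from lark import Lark

# Opt in to the `regex` module. If it is not importable,
# Lark(regex=True) raises ImportError (see the lark/lark.py hunk below).
parser = Lark(r"""
    ?start: NAME
    NAME: /\w+/
""", regex=True)

print(parser.parse("hello"))  # a Token equal to the string 'hello'
```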
---
 lark-stubs/lark.pyi      |  2 ++
 lark/lark.py             | 25 ++++++++++++++++++++++---
 lark/lexer.py            | 25 ++++++++++++++-----------
 lark/load_grammar.py     | 14 ++++++++------
 lark/parser_frontends.py | 38 ++++++++++++++++++++------------------
 tests/test_parser.py     |  4 ++--
 tests/test_regex.py      |  4 ++--
 7 files changed, 70 insertions(+), 42 deletions(-)

diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index 8e5e3dd..511e0ad 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -23,6 +23,7 @@ class LarkOptions:
     transformer: Optional[Transformer]
     postlex: Optional[PostLex]
     ambiguity: str
+    regex: bool
     debug: bool
     keep_all_tokens: bool
     propagate_positions: bool
@@ -48,6 +49,7 @@ class Lark:
         transformer: Optional[Transformer] = None,
         postlex: Optional[PostLex] = None,
         ambiguity: Literal["explicit", "resolve"] = "resolve",
+        regex: bool = False,
         debug: bool = False,
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,
diff --git a/lark/lark.py b/lark/lark.py
index 4497dd1..2c9dd42 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 from .grammar import Rule

+import re
+try:
+    import regex
+except ImportError:
+    regex = None
+
 ###{standalone

 class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
             When `False`, `[]` behaves like the `?` operator, and returns no value at all.
             (default=`False`. Recommended to set to `True`)

+        regex - When True, uses the `regex` module instead of the stdlib `re`.
         cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
                 LALR only for now.
             When `False`, does nothing (default)
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
         'start': 'start',
         'priority': 'auto',
         'ambiguity': 'auto',
+        'regex': False,
         'propagate_positions': False,
         'lexer_callbacks': {},
         'maybe_placeholders': False,
@@ -154,6 +162,16 @@ class Lark(Serialize):

         self.options = LarkOptions(options)

+        # Set regex or re module
+        use_regex = self.options.regex
+        if use_regex:
+            if regex:
+                self.re = regex
+            else:
+                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
+        else:
+            self.re = re
+
         # Some, but not all file-like objects have a 'name' attribute
         try:
             self.source = grammar.name
@@ -224,7 +242,7 @@ class Lark(Serialize):
             assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )

         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source)
+        self.grammar = load_grammar(grammar, self.source, self.re)

         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -285,7 +303,7 @@ class Lark(Serialize):
     def _build_parser(self):
         self._prepare_callbacks()
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
-        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)

     def save(self, f):
         data, m = self.memo_serialize([TerminalDef, Rule])
@@ -312,10 +330,11 @@ class Lark(Serialize):
         if postlex is not None:
             options['postlex'] = postlex
         self.options = LarkOptions.deserialize(options, memo)
+        self.re = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source = ''
         self._prepare_callbacks()
-        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
+        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
         return self

     @classmethod
diff --git a/lark/lexer.py b/lark/lexer.py
index 36541d1..4d5c498 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -1,9 +1,10 @@
 ## Lexer Implementation

+import re
 try:
-    import regex as re
+    import regex
 except ImportError:
-    import re
+    regex = None

 from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
@@ -233,7 +234,7 @@ class CallChain:



-def _create_unless(terminals, g_regex_flags):
+def _create_unless(terminals, g_regex_flags, re_):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -244,7 +245,7 @@ def _create_unless(terminals, g_regex_flags, re_):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
+            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
@@ -297,16 +298,17 @@ class Lexer(object):

 class TraditionalLexer(Lexer):

-    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

         terminals = list(terminals)

+        self.re = re_
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp(), g_regex_flags)
-            except re.error:
+                self.re.compile(t.pattern.to_regexp(), g_regex_flags)
+            except self.re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

             if t.pattern.min_width == 0:
@@ -324,7 +326,7 @@ class TraditionalLexer(Lexer):
         self.build(g_regex_flags)

     def build(self, g_regex_flags=0):
-        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
         assert all(self.callback.values())

         for type_, f in self.user_callbacks.items():
@@ -350,7 +352,8 @@ class TraditionalLexer(Lexer):

 class ContextualLexer(Lexer):

-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+        self.re = re_
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
@@ -365,12 +368,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+                lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+        self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)

     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index a4bef03..407d8d1 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -616,7 +616,7 @@ class Grammar:
 _imported_grammars = {}
-def import_grammar(grammar_path, base_paths=[]):
+def import_grammar(grammar_path, re_, base_paths=[]):
     if grammar_path not in _imported_grammars:
         import_paths = base_paths + IMPORT_PATHS
         for import_path in import_paths:
@@ -624,7 +624,7 @@ def import_grammar(grammar_path, re_, base_paths=[]):
             joined_path = os.path.join(import_path, grammar_path)
             with open(joined_path, encoding='utf8') as f:
                 text = f.read()
-            grammar = load_grammar(text, joined_path)
+            grammar = load_grammar(text, joined_path, re_)
             _imported_grammars[grammar_path] = grammar
             break
     else:
@@ -755,7 +756,8 @@ def _find_used_symbols(tree):
             for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

 class GrammarLoader:
-    def __init__(self):
+    def __init__(self, re_):
+        self.re = re_
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -764,7 +765,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)

         self.canonize_tree = CanonizeTree()
@@ -862,7 +863,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, base_paths=base_paths)
+            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

             term_defs += new_td
@@ -942,4 +943,5 @@ class GrammarLoader:



-load_grammar = GrammarLoader().load_grammar
+def load_grammar(grammar, source, re_):
+    return GrammarLoader(re_).load_grammar(grammar, source)
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 9f80ed4..c453ab6 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,7 +1,3 @@
-try:
-    import regex as re
-except ImportError:
-    import re
 from functools import partial

 from .utils import get_regexp_width, Serialize
@@ -66,14 +62,16 @@ class WithLexer(_ParserFrontend):
     __serialize_fields__ = 'parser', 'lexer_conf', 'start'
     __serialize_namespace__ = LexerConf,

-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         self.lexer_conf = lexer_conf
         self.start = parser_conf.start
         self.postlex = lexer_conf.postlex
+        self.re = re_

     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex):
+    def deserialize(cls, data, memo, callbacks, postlex, re_):
         inst = super(WithLexer, cls).deserialize(data, memo)
+        inst.re = re_
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
         inst.init_lexer()
@@ -91,13 +89,14 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)

     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

 class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         debug = options.debug if options else False
+        self.re = re_
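+        # Editor's note (annotation, not in the original patch): `re_` is
+        # stored on the frontend so that `init_lexer()` can hand the chosen
+        # module (stdlib `re` or third-party `regex`) down to
+        # TraditionalLexer/ContextualLexer, keeping all pattern compilation
+        # on one engine.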
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)

         self.init_lexer()
@@ -113,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     re_=self.re,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
                                      user_callbacks=self.lexer_conf.callbacks,
@@ -129,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}

 class LALR_CustomLexer(LALR_WithLexer):
-    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.lexer = lexer_cls(lexer_conf)
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
+        self.lexer = lexer_cls(lexer_conf, re_=re_)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)


 def tokenize_text(text):
     line = 1
     col_start_pos = 0
     for i, ch in enumerate(text):
         if '\n' in ch:
             line += ch.count('\n')
             col_start_pos = i + ch.rindex('\n')
         yield Token('CHAR', ch, line=line, column=i - col_start_pos)

 class Earley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()

         resolve_ambiguity = options.ambiguity == 'resolve'


 class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
+        self.re = re_
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
         self.start = parser_conf.start

             if width == 0:
                 raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

-            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
+            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)

     def parse(self, text, start):
         return self._parse(text, start)


 class XEarley_CompleteLex(XEarley):


 class CYK(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()

         self._analysis = GrammarAnalyzer(parser_conf)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index c6f420e..f8f37df 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -551,8 +551,8 @@ class CustomLexer(Lexer):
     Purpose of this custom lexer is to test the integration,
     so it uses the traditional parser as implementation without custom lexing behaviour.
""" - def __init__(self, lexer_conf): - self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags) + def __init__(self, lexer_conf, re_): + self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags) def lex(self, *args, **kwargs): return self.lexer.lex(*args, **kwargs) diff --git a/tests/test_regex.py b/tests/test_regex.py index db0bb85..6932a6b 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -17,7 +17,7 @@ class TestRegex(unittest.TestCase): NAME: ID_START ID_CONTINUE* ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/ ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/ - """) + """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') @@ -26,7 +26,7 @@ class TestRegex(unittest.TestCase): g = Lark(r""" ?start: NAME NAME: /[\w]+/ - """) + """, regex=True) self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') From 857f71e3aaade4e9fed8f87e728dada22e1ef060 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:12:05 -0400 Subject: [PATCH 05/12] Added regex tests to tox. --- tests/__main__.py | 2 +- tests/test_regex.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/__main__.py b/tests/__main__.py index cb26eb4..6b8f513 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -7,7 +7,7 @@ from .test_trees import TestTrees from .test_tools import TestStandalone from .test_cache import TestCache from .test_reconstructor import TestReconstructor - +from .test_regex import TestRegex try: from .test_nearley.test_nearley import TestNearley except ImportError: diff --git a/tests/test_regex.py b/tests/test_regex.py index 6932a6b..19f1923 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -2,6 +2,7 @@ from __future__ import absolute_import import logging +import sys import unittest logging.basicConfig(level=logging.INFO) @@ -10,8 +11,10 @@ from lark.lark import Lark class TestRegex(unittest.TestCase): + @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." + print(sys.version_info) g = Lark(r""" ?start: NAME NAME: ID_START ID_CONTINUE* @@ -21,6 +24,7 @@ class TestRegex(unittest.TestCase): self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்') + @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_word(self): "Tests that a persistent bug in the `re` module works when `regex` is enabled." g = Lark(r""" From 797195d8ad212e62a6c51fe5d767afdeeefa3ae9 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Fri, 26 Jun 2020 11:21:35 -0400 Subject: [PATCH 06/12] Removed debug print --- tests/test_regex.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_regex.py b/tests/test_regex.py index 19f1923..d20a8bf 100644 --- a/tests/test_regex.py +++ b/tests/test_regex.py @@ -14,7 +14,6 @@ class TestRegex(unittest.TestCase): @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.') def test_unicode_class(self): "Tests that character classes from the `regex` module work correctly." 
-        print(sys.version_info)
         g = Lark(r"""
                     ?start: NAME
                     NAME: ID_START ID_CONTINUE*

From 1465ac73537d5f42d4d977d5a8c5c91b9b9d51bc Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:21:51 -0400
Subject: [PATCH 07/12] Added `regex` extras dependency

---
 tox.ini | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tox.ini b/tox.ini
index 5427f0f..ee0c5dd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -17,6 +17,9 @@ deps =
     -rnearley-requirements.txt
     -rregex-requirements.txt

+# For regex testing
+extras = regex
+
 # to always force recreation and avoid unexpected side effects
 recreate=True

From 959d05ad36a24186daa0cde887ea4325eff72d0a Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:27:43 -0400
Subject: [PATCH 08/12] Try with extras_require

---
 setup.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/setup.py b/setup.py
index d31e4d2..a3d2a97 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,10 @@ setup(
     requires = [],
     install_requires = [],

+    extras_require = {
+        "regex": ["regex"]
+    },
+
     package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},

     test_suite = 'tests.__main__',

From a163b344b3a8868c1eb0819faa12bda6ec7eb7c2 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:32:23 -0400
Subject: [PATCH 09/12] Found it!

---
 .github/workflows/tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7be3a92..f55b88c 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -23,6 +23,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install -r nearley-requirements.txt
+          pip install -r regex-requirements.txt
       - name: Run tests
         run: |
           python -m tests
\ No newline at end of file

From 5fe67b9fc4c8302534bef499a6ddc6b7c3344eac Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:35:46 -0400
Subject: [PATCH 10/12] Merged test requirements

---
 .github/workflows/tests.yml                       | 3 +--
 regex-requirements.txt                            | 1 -
 nearley-requirements.txt => test-requirements.txt | 1 +
 3 files changed, 2 insertions(+), 3 deletions(-)
 delete mode 100644 regex-requirements.txt
 rename nearley-requirements.txt => test-requirements.txt (70%)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f55b88c..6d1e406 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -22,8 +22,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -r nearley-requirements.txt
-          pip install -r regex-requirements.txt
+          pip install -r test-requirements.txt
       - name: Run tests
         run: |
           python -m tests
\ No newline at end of file
diff --git a/regex-requirements.txt b/regex-requirements.txt
deleted file mode 100644
index 822e14a..0000000
--- a/regex-requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-regex
\ No newline at end of file
diff --git a/nearley-requirements.txt b/test-requirements.txt
similarity index 70%
rename from nearley-requirements.txt
rename to test-requirements.txt
index 750c740..d304ee8 100644
--- a/nearley-requirements.txt
+++ b/test-requirements.txt
@@ -1 +1,2 @@
 Js2Py==0.68
+regex
\ No newline at end of file

From e22536fc9b70e1ec6a875f20754331826c3197fd Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:40:18 -0400
Subject: [PATCH 11/12] Updated stubs

---
 lark-stubs/lexer.pyi | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi
index a43b754..1ae861d 100644
--- a/lark-stubs/lexer.pyi
+++ b/lark-stubs/lexer.pyi
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-
+from types import ModuleType
 from typing import (
     TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional, Pattern as REPattern,
 )
@@ -111,6 +111,7 @@ class TraditionalLexer(Lexer):
     def __init__(
         self,
         terminals: Collection[TerminalDef],
+        re_: ModuleType,
         ignore: Collection[str] = ...,
         user_callbacks: Dict[str, _Callback] = ...,
         g_regex_flags: int = ...
@@ -135,6 +136,7 @@ class ContextualLexer(Lexer):
         self,
         terminals: Collection[TerminalDef],
         states: Dict[str, Collection[str]],
+        re_: ModuleType,
         ignore: Collection[str] = ...,
         always_accept: Collection[str] = ...,
         user_callbacks: Dict[str, _Callback] = ...,

From c319ace48d1b0edea506a5364fd04816480e84a7 Mon Sep 17 00:00:00 2001
From: julienmalard
Date: Fri, 26 Jun 2020 11:47:00 -0400
Subject: [PATCH 12/12] Update README.md

---
 README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/README.md b/README.md
index 1c7062c..02b89d7 100644
--- a/README.md
+++ b/README.md
@@ -176,6 +176,27 @@ You can use the output as a regular python module:
 0.38981434460254655
 ```

+### Using Unicode character classes with `regex`
+Python's built-in `re` module has a few persistent known bugs and also does not support
+Unicode character classes such as `\p{Lu}`.
+With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark`
+and can act as a drop-in replacement to `re`.
+
+Any `Lark` instance created with `regex=True` will use the `regex` module instead of `re`.
+For example, we can now use character classes to match PEP-3131 compliant Python identifiers.
+```python
+>>> from lark import Lark
+>>> g = Lark(r"""
+                    ?start: NAME
+                    NAME: ID_START ID_CONTINUE*
+                    ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                    ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
+                """, regex=True)
+
+>>> g.parse('வணக்கம்')
+'வணக்கம்'
+
+```
 ## License
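
[Editor's note] The width-computation workaround from patch 03 is worth seeing in isolation. The following self-contained sketch (simplified from the `lark/utils.py` hunk above; the no-`regex` fallback branch and error handling are omitted) shows why substituting a plain letter is safe: a `\p{...}` escape always matches exactly one character, so the pattern's minimum and maximum widths are unchanged:

```python
import re
import sre_parse

# Matches Unicode-property escapes such as \p{Lu} or \p{Mn} in a pattern string.
categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')

def get_regexp_width(regexp):
    # Swap each one-character \p{...} escape for the literal 'A' so that
    # sre_parse, which rejects Unicode properties, can still compute widths.
    stripped = categ_pattern.sub('A', regexp)
    return [int(x) for x in sre_parse.parse(stripped).getwidth()]

print(get_regexp_width(r'[\p{Lu}\p{Ll}\p{Nl}_]+'))  # [1, 4294967295], i.e. unbounded
```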