Merge branch 'regex' of https://github.com/julienmalard/lark into julienmalard-regex

5 years ago · d5e6ac6611
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -22,7 +22,7 @@ jobs:
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r nearley-requirements.txt
          pip install -r test-requirements.txt
      - name: Run tests
        run: |
          python -m tests
--- a/README.md
+++ b/README.md
@@ -177,6 +177,27 @@ You can use the output as a regular python module:
 0.38981434460254655
 ```

 ### Using Unicode character classes with `regex`
 Python's builtin `re` module has a few persistent known bugs and also won't parse
 advanced regex features such as Unicode character classes (for example `\p{Lu}`).
 With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark`
 and can act as a drop-in replacement for `re`.

 Any `Lark` instance created with `regex=True` will use the `regex` module
 instead of `re`. For example, we can now use Unicode character classes to match PEP 3131-compliant Python identifiers:
 ```python
 >>> from lark import Lark
 >>> g = Lark(r"""
                    ?start: NAME
                    NAME: ID_START ID_CONTINUE*
                    ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
                    ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
                """, regex=True)

 >>> g.parse('வணக்கம்') 
 'வணக்கம்'

 ```

 ## License

--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -23,6 +23,7 @@ class LarkOptions:
    transformer: Optional[Transformer]
    postlex: Optional[PostLex]
    ambiguity: str
    regex: bool
    debug: bool
    keep_all_tokens: bool
    propagate_positions: bool
@@ -48,6 +49,7 @@ class Lark:
        transformer: Optional[Transformer] = None,
        postlex: Optional[PostLex] = None,
        ambiguity: Literal["explicit", "resolve"] = "resolve",
        regex: bool = False,
        debug: bool = False,
        keep_all_tokens: bool = False,
        propagate_positions: bool = False,
--- a/lark-stubs/lexer.pyi
+++ b/lark-stubs/lexer.pyi
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-

 from types import ModuleType
 from typing import (
    TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional,
    Pattern as REPattern,
@@ -111,6 +111,7 @@ class TraditionalLexer(Lexer):
    def __init__(
        self,
        terminals: Collection[TerminalDef],
        re_: ModuleType,
        ignore: Collection[str] = ...,
        user_callbacks: Dict[str, _Callback] = ...,
        g_regex_flags: int = ...
@@ -135,6 +136,7 @@ class ContextualLexer(Lexer):
        self,
        terminals: Collection[TerminalDef],
        states: Dict[str, Collection[str]],
        re_: ModuleType,
        ignore: Collection[str] = ...,
        always_accept: Collection[str] = ...,
        user_callbacks: Dict[str, _Callback] = ...,
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 from .grammar import Rule

 import re
 try:
    import regex
 except ImportError:
    regex = None

 ###{standalone

 class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
                         When `False`,  `[]` behaves like the `?` operator,
                             and returns no value at all.
                         (default=`False`. Recommended to set to `True`)
    regex - When True, uses the `regex` module instead of the stdlib `re`.
    cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
            LALR only for now.
        When `False`, does nothing (default)
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
        'start': 'start',
        'priority': 'auto',
        'ambiguity': 'auto',
        'regex': False,
        'propagate_positions': False,
        'lexer_callbacks': {},
        'maybe_placeholders': False,
@@ -154,6 +162,16 @@ class Lark(Serialize):

        self.options = LarkOptions(options)

        # Set regex or re module
        use_regex = self.options.regex
        if use_regex:
            if regex:
                self.re = regex
            else:
                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
        else:
            self.re = re

        # Some, but not all file-like objects have a 'name' attribute
        try:
            self.source = grammar.name
@@ -225,7 +243,7 @@ class Lark(Serialize):
        assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )

        # Parse the grammar file and compose the grammars (TODO)
        self.grammar = load_grammar(grammar, self.source)
        self.grammar = load_grammar(grammar, self.source, self.re)

        # Compile the EBNF grammar into BNF
        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -286,7 +304,7 @@ class Lark(Serialize):
    def _build_parser(self):
        self._prepare_callbacks()
        parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)

    def save(self, f):
        data, m = self.memo_serialize([TerminalDef, Rule])
@@ -313,10 +331,11 @@ class Lark(Serialize):
        if postlex is not None:
            options['postlex'] = postlex
        self.options = LarkOptions.deserialize(options, memo)
        self.re = regex if self.options.regex else re
        self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
        self.source = '<deserialized>'
        self._prepare_callbacks()
        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
        return self

    @classmethod
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -1,6 +1,10 @@
 ## Lexer Implementation

 import re
 try:
    import regex
 except ImportError:
    regex = None

 from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
@@ -230,7 +234,7 @@ class CallChain:



 def _create_unless(terminals, g_regex_flags):
 def _create_unless(terminals, g_regex_flags, re_):
    tokens_by_type = classify(terminals, lambda t: type(t.pattern))
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
    embedded_strs = set()
@@ -241,7 +245,7 @@ def _create_unless(terminals, g_regex_flags):
            if strtok.priority > retok.priority:
                continue
            s = strtok.pattern.value
            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
            if m and m.group(0) == s:
                unless.append(strtok)
                if strtok.pattern.flags <= retok.pattern.flags:
@@ -294,16 +298,17 @@ class Lexer(object):

 class TraditionalLexer(Lexer):

    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
    def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals

        terminals = list(terminals)

        self.re = re_
        # Sanitization
        for t in terminals:
            try:
                re.compile(t.pattern.to_regexp(), g_regex_flags)
            except re.error:
                self.re.compile(t.pattern.to_regexp(), g_regex_flags)
            except self.re.error:
                raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

            if t.pattern.min_width == 0:
@@ -321,7 +326,7 @@ class TraditionalLexer(Lexer):
        self.build(g_regex_flags)

    def build(self, g_regex_flags=0):
        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
        terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
@@ -347,7 +352,8 @@ class TraditionalLexer(Lexer):

 class ContextualLexer(Lexer):

    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
    def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
        self.re = re_
        tokens_by_name = {}
        for t in terminals:
            assert t.name not in tokens_by_name, t
@@ -362,12 +368,12 @@ class ContextualLexer(Lexer):
            except KeyError:
                accepts = set(accepts) | set(ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
        self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)

    def lex(self, stream, get_parser_state):
        parser_state = get_parser_state()
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -616,7 +616,7 @@ class Grammar:


 _imported_grammars = {}
 def import_grammar(grammar_path, base_paths=[]):
 def import_grammar(grammar_path, re_, base_paths=[]):
    if grammar_path not in _imported_grammars:
        import_paths = base_paths + IMPORT_PATHS
        for import_path in import_paths:
@@ -624,7 +624,7 @@ def import_grammar(grammar_path, base_paths=[]):
                joined_path = os.path.join(import_path, grammar_path)
                with open(joined_path, encoding='utf8') as f:
                    text = f.read()
                grammar = load_grammar(text, joined_path)
                grammar = load_grammar(text, joined_path, re_)
                _imported_grammars[grammar_path] = grammar
                break
        else:
@@ -755,7 +755,8 @@ def _find_used_symbols(tree):
              for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

 class GrammarLoader:
    def __init__(self):
    def __init__(self, re_):
        self.re = re_
        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

        rules = [options_from_rule(name, None, x) for name, x in  RULES.items()]
@@ -764,7 +765,7 @@ class GrammarLoader:
        lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

        parser_conf = ParserConf(rules, callback, ['start'])
        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)

        self.canonize_tree = CanonizeTree()

@@ -862,7 +863,7 @@ class GrammarLoader:
        # import grammars
        for dotted_path, (base_paths, aliases) in imports.items():
            grammar_path = os.path.join(*dotted_path) + EXT
            g = import_grammar(grammar_path, base_paths=base_paths)
            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
            new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

            term_defs += new_td
@@ -942,4 +943,5 @@ class GrammarLoader:



 load_grammar = GrammarLoader().load_grammar
 def load_grammar(grammar, source, re_):
    """Load `grammar` using a `GrammarLoader` bound to the regex module `re_`.

    `grammar` and `source` are forwarded unchanged to
    `GrammarLoader.load_grammar`; `re_` is handed to the `GrammarLoader`
    constructor. A new `GrammarLoader` is constructed on every call.
    """
    return GrammarLoader(re_).load_grammar(grammar, source)
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,4 +1,3 @@
 import re
 from functools import partial

 from .utils import get_regexp_width, Serialize
@@ -63,14 +62,16 @@ class WithLexer(_ParserFrontend):
    __serialize_fields__ = 'parser', 'lexer_conf', 'start'
    __serialize_namespace__ = LexerConf,

    def __init__(self, lexer_conf, parser_conf, options=None):
    def __init__(self, lexer_conf, parser_conf, re_, options=None):
        self.lexer_conf = lexer_conf
        self.start = parser_conf.start
        self.postlex = lexer_conf.postlex
        self.re = re_

    @classmethod
    def deserialize(cls, data, memo, callbacks, postlex):
    def deserialize(cls, data, memo, callbacks, postlex, re_):
        inst = super(WithLexer, cls).deserialize(data, memo)
        inst.re = re_
        inst.postlex = postlex
        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
        inst.init_lexer()
@@ -88,13 +89,14 @@ class WithLexer(_ParserFrontend):
        return self._parse(token_stream, start)

    def init_traditional_lexer(self):
        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

 class LALR_WithLexer(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
    def __init__(self, lexer_conf, parser_conf, re_, options=None):
        debug = options.debug if options else False
        self.re = re_
        self.parser = LALR_Parser(parser_conf, debug=debug)
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)

        self.init_lexer()

@@ -110,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
        always_accept = self.postlex.always_accept if self.postlex else ()
        self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
                                     re_=self.re,
                                     ignore=self.lexer_conf.ignore,
                                     always_accept=always_accept,
                                     user_callbacks=self.lexer_conf.callbacks,
@@ -126,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}

 class LALR_CustomLexer(LALR_WithLexer):
    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
        self.lexer = lexer_cls(lexer_conf)
    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
        self.lexer = lexer_cls(lexer_conf, re_=re_)
        debug = options.debug if options else False
        self.parser = LALR_Parser(parser_conf, debug=debug)
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)


 def tokenize_text(text):
@@ -143,8 +146,8 @@ def tokenize_text(text):
        yield Token('CHAR', ch, line=line, column=i - col_start_pos)

 class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
    def __init__(self, lexer_conf, parser_conf, re_, options=None):
        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
        self.init_traditional_lexer()

        resolve_ambiguity = options.ambiguity == 'resolve'
@@ -156,7 +159,9 @@ class Earley(WithLexer):


 class XEarley(_ParserFrontend):
    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
        self.re = re_

        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
        self.start = parser_conf.start

@@ -188,7 +193,7 @@ class XEarley(_ParserFrontend):
                if width == 0:
                    raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)

    def parse(self, text, start):
        return self._parse(text, start)
@@ -201,8 +206,8 @@ class XEarley_CompleteLex(XEarley):

 class CYK(WithLexer):

    def __init__(self, lexer_conf, parser_conf, options=None):
        WithLexer.__init__(self, lexer_conf, parser_conf, options)
    def __init__(self, lexer_conf, parser_conf, re_, options=None):
        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
        self.init_traditional_lexer()

        self._analysis = GrammarAnalyzer(parser_conf)
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -165,14 +165,29 @@ def smart_decorator(f, create_decorator):
    else:
        return create_decorator(f.__func__.__call__, True)

 try:
    import regex
 except ImportError:
    regex = None

 import sys, re
 Py36 = (sys.version_info[:2] >= (3, 6))

 import sre_parse
 import sre_constants
 categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
 def get_regexp_width(regexp):
    if regex:
        # Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
        # a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
        # match here below.
        regexp_final = re.sub(categ_pattern, 'A', regexp)
    else:
        if re.search(categ_pattern, regexp):
            raise ImportError('`regex` module must be installed in order to use Unicode categories.', regexp)
        regexp_final = regexp
    try:
        return [int(x) for x in sre_parse.parse(regexp).getwidth()]
        return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
    except sre_constants.error:
        raise ValueError(regexp)

@@ -182,7 +197,7 @@ def get_regexp_width(regexp):
 def dedup_list(l):
    """Given a list (l) will removing duplicates from the list,
       preserving the original order of the list. Assumes that
       the list entrie are hashable."""
       the list entries are hashable."""
    dedup = set()
    return [ x for x in l if not (x in dedup or dedup.add(x))]

--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,7 @@
 import re
 try:
    import regex as re
 except ImportError:
    import re
 from setuptools import find_packages, setup

 __version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read())
@@ -11,6 +14,10 @@ setup(
    requires = [],
    install_requires = [],

    extras_require = {
        "regex": ["regex"]
    },

    package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},

    test_suite = 'tests.__main__',
--- a/nearley-requirements.txt
+++ b/nearley-requirements.txt
@@ -1 +1,2 @@
 Js2Py==0.68
 regex
--- a/tests/main.py
+++ b/tests/main.py
@@ -7,7 +7,7 @@ from .test_trees import TestTrees
 from .test_tools import TestStandalone
 from .test_cache import TestCache
 from .test_reconstructor import TestReconstructor

 from .test_regex import TestRegex
 try:
    from .test_nearley.test_nearley import TestNearley
 except ImportError:
--- a/tests/test_nearley/nearley
+++ b/tests/test_nearley/nearley
@@ -1 +1 @@
 Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44
 Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1,7 +1,10 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

 import re
 try:
    import regex as re
 except ImportError:
    import re
 import unittest
 import logging
 import os
@@ -548,8 +551,8 @@ class CustomLexer(Lexer):
    Purpose of this custom lexer is to test the integration,
    so it uses the traditionalparser as implementation without custom lexing behaviour.
    """
    def __init__(self, lexer_conf):
        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
    def __init__(self, lexer_conf, re_):
        self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
    def lex(self, *args, **kwargs):
        return self.lexer.lex(*args, **kwargs)

--- a/tests/test_regex.py
+++ b/tests/test_regex.py
@@ -0,0 +1,37 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

 import logging
 import sys
 import unittest

 logging.basicConfig(level=logging.INFO)

 from lark.lark import Lark


 class TestRegex(unittest.TestCase):
    """Tests for grammars built with `Lark(..., regex=True)`.

    Both tests are skipped on Python 2 and parse the same non-ASCII word,
    expecting the parse result to equal the input text.
    """

    @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
    def test_unicode_class(self):
        "Tests that Unicode category classes in terminals are accepted when `regex=True`."
        g = Lark(r"""
                    ?start: NAME
                    NAME: ID_START ID_CONTINUE*
                    ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
                    ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
                """, regex=True)

        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

    @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
    def test_unicode_word(self):
        "Tests that a word-class terminal matches the whole non-ASCII word when `regex=True` (a case the `re` module is said to get wrong)."
        g = Lark(r"""
                    ?start: NAME
                    NAME: /[\w]+/
                """, regex=True)
        self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')


 if __name__ == '__main__':
    unittest.main()
--- a/tox.ini
+++ b/tox.ini
@@ -15,6 +15,10 @@ pypy3 = pypy3
 whitelist_externals = git
 deps =
    -rnearley-requirements.txt
    -rregex-requirements.txt

 # For regex testing
 extras = regex

 # to always force recreation and avoid unexpected side effects
 recreate=True