| @@ -127,7 +127,7 @@ class Token(Str): | |||
| end_column: The next column after the end of the token. For example, | |||
| if the token is a single character with a column value of 4, | |||
| end_column will be 5. | |||
| end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``) | |||
| end_pos: the index where the token ends (basically ``start_pos + len(token)``) | |||
| """ | |||
| __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') | |||
| @@ -214,15 +214,13 @@ class LineCounter: | |||
class UnlessCallback:
    """Token-type reassignment callback for "unless" terminals.

    Wraps a ``Scanner`` built (with ``match_whole=True``) from string
    terminals that are embedded inside a more general regexp terminal
    (e.g. keywords inside an identifier pattern).  When called on a freshly
    matched token, it re-scans the token's value and, if the whole value is
    one of the embedded terminals, rewrites the token's type accordingly.

    NOTE(review): the diff source showed both the old ``mres``-based and the
    new ``scanner``-based implementations interleaved; this is the resolved
    scanner-based version.
    """

    def __init__(self, scanner):
        # Scanner compiled with match_whole=True, so a hit means the ENTIRE
        # token value matched one of the embedded terminals.
        self.scanner = scanner

    def __call__(self, t):
        """Return *t*, with ``t.type`` replaced when the scanner matches its value."""
        res = self.scanner.match(t.value, 0)
        if res:
            _value, t.type = res
        return t
| @@ -254,34 +252,51 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): | |||
| if strtok.pattern.flags <= retok.pattern.flags: | |||
| embedded_strs.add(strtok) | |||
| if unless: | |||
| callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) | |||
| callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) | |||
| terminals = [t for t in terminals if t not in embedded_strs] | |||
| return terminals, callback | |||
| def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): | |||
| # Python sets an unreasonable group limit (currently 100) in its re module | |||
| # Worse, the only way to know we reached it is by catching an AssertionError! | |||
| # This function recursively tries less and less groups until it's successful. | |||
| postfix = '$' if match_whole else '' | |||
| mres = [] | |||
| while terminals: | |||
| pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) | |||
| if use_bytes: | |||
| pattern = pattern.encode('latin-1') | |||
| try: | |||
| mre = re_.compile(pattern, g_regex_flags) | |||
| except AssertionError: # Yes, this is what Python provides us.. :/ | |||
| return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) | |||
| mres.append((mre, {i: n for n, i in mre.groupindex.items()})) | |||
| terminals = terminals[max_size:] | |||
| return mres | |||
class Scanner:
    """Compiles a list of terminals into regexes and matches text against them.

    Because Python's ``re`` module caps the number of named groups per
    pattern (currently 100), the terminals may be split across several
    compiled regexes; ``match`` tries them in order.

    NOTE(review): the diff source had the removed module-level
    ``build_mres`` wrapper spliced into the middle of this class body;
    those foreign lines are dropped here.
    """

    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.re_ = re_
        self.use_bytes = use_bytes
        # When True, each terminal pattern is anchored with '$' so only
        # whole-string matches count (used for keyword "unless" callbacks).
        self.match_whole = match_whole
        self._mres = self._build_mres(terminals, len(terminals))

    def _build_mres(self, terminals, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries less and less groups until it's successful.
        postfix = '$' if self.match_whole else ''
        mres = []
        while terminals:
            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            try:
                mre = self.re_.compile(pattern, self.g_regex_flags)
            except AssertionError:  # Yes, this is what Python provides us.. :/
                return self._build_mres(terminals, max_size // 2)
            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
            terminals = terminals[max_size:]
        return mres

    def match(self, text, pos):
        """Return ``(matched_value, terminal_name)`` at *pos*, or ``None`` if nothing matches."""
        for mre, type_from_index in self._mres:
            m = mre.match(text, pos)
            if m:
                return m.group(0), type_from_index[m.lastindex]
        return None  # explicit: no compiled regex matched at pos

    @property
    def allowed_types(self):
        """The set of terminal names this scanner can produce."""
        return {v for _mre, tfi in self._mres for v in tfi.values()}
| def _regexp_has_newline(r): | |||
| @@ -341,9 +356,9 @@ class TraditionalLexer(Lexer): | |||
| self.use_bytes = conf.use_bytes | |||
| self.terminals_by_name = conf.terminals_by_name | |||
| self._mres = None | |||
| self._scanner = None | |||
| def _build(self): | |||
| def _build_scanner(self): | |||
| terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) | |||
| assert all(self.callback.values()) | |||
| @@ -354,19 +369,16 @@ class TraditionalLexer(Lexer): | |||
| else: | |||
| self.callback[type_] = f | |||
| self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) | |||
| self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) | |||
| @property | |||
| def mres(self): | |||
| if self._mres is None: | |||
| self._build() | |||
| return self._mres | |||
| def scanner(self): | |||
| if self._scanner is None: | |||
| self._build_scanner() | |||
| return self._scanner | |||
| def match(self, text, pos): | |||
| for mre, type_from_index in self.mres: | |||
| m = mre.match(text, pos) | |||
| if m: | |||
| return m.group(0), type_from_index[m.lastindex] | |||
| return self.scanner.match(text, pos) | |||
| def lex(self, state, parser_state): | |||
| with suppress(EOFError): | |||
| @@ -378,7 +390,7 @@ class TraditionalLexer(Lexer): | |||
| while line_ctr.char_pos < len(lex_state.text): | |||
| res = self.match(lex_state.text, line_ctr.char_pos) | |||
| if not res: | |||
| allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types | |||
| allowed = self.scanner.allowed_types - self.ignore_types | |||
| if not allowed: | |||
| allowed = {"<END-OF-FILE>"} | |||
| raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, | |||