
Merge branch 'MegaIng-global_flags'

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.8.3
Erez Sh committed 4 years ago
commit c146af9c50
7 changed files with 48 additions and 29 deletions:

  1. lark/common.py            +3  -2
  2. lark/lark.py              +4  -2
  3. lark/lexer.py             +17 -17
  4. lark/parser_frontends.py  +4  -3
  5. lark_stubs/lark.pyi       +3  -1
  6. lark_stubs/lexer.pyi      +4  -2
  7. tests/test_parser.py      +13 -2

lark/common.py (+3 -2)

@@ -4,14 +4,15 @@ from .lexer import TerminalDef
 ###{standalone

 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
     __serialize_namespace__ = TerminalDef,

-    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
+    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0):
         self.tokens = tokens
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
+        self.g_regex_flags = g_regex_flags

     def _deserialize(self):
         self.callbacks = {} # TODO
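
Note: the ###{standalone marker means LexerConf is also embedded in generated standalone parsers, so adding g_regex_flags to __serialize_fields__ keeps the flags across serialization instead of silently resetting them to 0.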


lark/lark.py (+4 -2)

@@ -48,6 +48,7 @@ class LarkOptions(Serialize):
     propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
     lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
     maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None
+    g_regex_flags - Flags that are applied to all Terminals (Regex and Strings)
     """
     if __doc__:
         __doc__ += OPTIONS_DOC
@@ -68,6 +69,7 @@ class LarkOptions(Serialize):
         'lexer_callbacks': {},
         'maybe_placeholders': False,
         'edit_terminals': None,
+        'g_regex_flags': 0,
     }

     def __init__(self, options_dict):
@@ -209,7 +211,7 @@ class Lark(Serialize):
             if hasattr(t, term.name):
                 lexer_callbacks[term.name] = getattr(t, term.name)

-        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks)
+        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)

         if self.options.parser:
             self.parser = self._build_parser()
@@ -222,7 +224,7 @@ class Lark(Serialize):
     __serialize_fields__ = 'parser', 'rules', 'options'

     def _build_lexer(self):
-        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

     def _prepare_callbacks(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
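
For orientation, the new option is consumed at construction time. A minimal usage sketch; the grammar here is invented for illustration, not taken from this commit:

    import re
    from lark import Lark

    # g_regex_flags is forwarded through LexerConf into every compiled
    # terminal, so the /[a-z]+/ regexp below matches case-insensitively.
    parser = Lark(r'''
        start: WORD+
        WORD: /[a-z]+/
        %ignore " "
    ''', g_regex_flags=re.IGNORECASE)

    print(parser.parse("Hello World").children)   # two WORD tokens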


lark/lexer.py (+17 -17)

@@ -230,7 +230,7 @@ class CallChain:



-def _create_unless(terminals):
+def _create_unless(terminals, g_regex_flags):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -241,19 +241,19 @@ def _create_unless(terminals):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s)
+            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))

     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback


-def _build_mres(terminals, max_size, match_whole):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, match_whole):
     mres = []
     while terminals:
         try:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]))
+            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, match_whole)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)

         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres

-def build_mres(terminals, match_whole=False):
-    return _build_mres(terminals, len(terminals), match_whole)
+def build_mres(terminals, g_regex_flags, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)

 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -294,7 +294,7 @@ class Lexer(object):

 class TraditionalLexer(Lexer):

-    def __init__(self, terminals, ignore=(), user_callbacks={}):
+    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

         terminals = list(terminals)
@@ -302,7 +302,7 @@ class TraditionalLexer(Lexer):
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp())
+                re.compile(t.pattern.to_regexp(), g_regex_flags)
             except re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

@@ -318,10 +318,10 @@ class TraditionalLexer(Lexer):
         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
         self.terminals = terminals
         self.user_callbacks = user_callbacks
-        self.build()
+        self.build(g_regex_flags)

-    def build(self):
-        terminals, self.callback = _create_unless(self.terminals)
+    def build(self, g_regex_flags=0):
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
         assert all(self.callback.values())

         for type_, f in self.user_callbacks.items():
@@ -331,7 +331,7 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f

-        self.mres = build_mres(terminals)
+        self.mres = build_mres(terminals, g_regex_flags)

     def match(self, stream, pos):
         for mre, type_from_index in self.mres:
@@ -347,7 +347,7 @@ class TraditionalLexer(Lexer):

 class ContextualLexer(Lexer):

-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
+    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
@@ -362,12 +362,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
+                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)

     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
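
For background, the recursion in _build_mres works around the re module's cap on named groups (100 on older Pythons; the limit has since been lifted), which surfaces only as a bare AssertionError. A standalone sketch of the same fallback strategy; compile_in_chunks and the dummy terminals below are hypothetical, not lark code:

    import re

    def compile_in_chunks(named_patterns, max_size):
        # OR many named groups into one regexp, as _build_mres does;
        # if the group limit is hit, retry with half the chunk size.
        mres = []
        while named_patterns:
            try:
                mre = re.compile('|'.join('(?P<%s>%s)' % (name, pat)
                                          for name, pat in named_patterns[:max_size]))
            except AssertionError:  # group-limit signal on older Pythons
                return compile_in_chunks(named_patterns, max_size // 2)
            mres.append(mre)
            named_patterns = named_patterns[max_size:]
        return mres

    # 250 dummy terminals: one chunk on Pythons without the limit, several otherwise
    mres = compile_in_chunks([('T%d' % i, 'x%d' % i) for i in range(250)], 250)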


lark/parser_frontends.py (+4 -3)

@@ -88,7 +88,7 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)

     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

 class LALR_WithLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
@@ -112,7 +112,8 @@ class LALR_ContextualLexer(LALR_WithLexer):
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
-                                     user_callbacks=self.lexer_conf.callbacks)
+                                     user_callbacks=self.lexer_conf.callbacks,
+                                     g_regex_flags=self.lexer_conf.g_regex_flags)


     def parse(self, text, start=None):
@@ -187,7 +188,7 @@ class XEarley(_ParserFrontend):
                 if width == 0:
                     raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

-                self.regexps[t.name] = re.compile(regexp)
+                self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)

     def parse(self, text, start):
         return self._parse(text, start)
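
The XEarley case differs from the others: its dynamic lexing bypasses TraditionalLexer entirely and compiles each terminal's regexp itself, so it reads g_regex_flags straight off lexer_conf rather than receiving it through a lexer constructor.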


lark_stubs/lark.pyi (+3 -1)

@@ -34,6 +34,7 @@ class LarkOptions:
     maybe_placeholders: bool
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache_grammar: bool
+    g_regex_flags: int


 class Lark:
@@ -56,7 +57,8 @@ class Lark:
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,
         maybe_placeholders: bool = False,
-        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None
+        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
+        g_regex_flags: int = ...
     ):
         ...

lark_stubs/lexer.pyi (+4 -2)

@@ -112,7 +112,8 @@ class TraditionalLexer(Lexer):
         self,
         terminals: Collection[TerminalDef],
         ignore: Collection[str] = ...,
-        user_callbacks: Dict[str, _Callback] = ...
+        user_callbacks: Dict[str, _Callback] = ...,
+        g_regex_flags: int = ...
     ):
         ...

@@ -136,7 +137,8 @@ class ContextualLexer(Lexer):
         states: Dict[str, Collection[str]],
         ignore: Collection[str] = ...,
         always_accept: Collection[str] = ...,
-        user_callbacks: Dict[str, _Callback] = ...
+        user_callbacks: Dict[str, _Callback] = ...,
+        g_regex_flags: int = ...
     ):
         ...

tests/test_parser.py (+13 -2)

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

+import re
 import unittest
 import logging
 import os
@@ -538,7 +539,7 @@ class CustomLexer(Lexer):
     so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
     def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)

@@ -831,7 +832,16 @@ def _make_parser_test(LEXER, PARSER):

             x = g.parse("starts")
             self.assertSequenceEqual(x.children, ['starts'])

+        def test_g_regex_flags(self):
+            g = _Lark("""
+                    start: "a" /b+/ C
+                    C: "C" | D
+                    D: "D" E
+                    E: "e"
+                    """, g_regex_flags=re.I)
+            x1 = g.parse("ABBc")
+            x2 = g.parse("abdE")

         # def test_string_priority(self):
         #     g = _Lark("""start: (A | /a?bb/)+
@@ -1701,6 +1711,7 @@ def _make_parser_test(LEXER, PARSER):

     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME
+    _TestParser.__qualname__ = "tests.test_parser." + _NAME
     globals()[_NAME] = _TestParser

 # Note: You still have to import them in __main__ for the tests to run
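
The new test leans on re.I reaching both terminal kinds: /b+/ is a regexp terminal, while "a", "C", "D" and "e" are string terminals, so "ABBc" and "abdE" only parse when the flag is applied globally. A quick reproduction outside the test suite (assuming lark >= 0.8.3):

    import re
    from lark import Lark

    g = Lark('''
        start: "a" /b+/ C
        C: "C" | D
        D: "D" E
        E: "e"
    ''', g_regex_flags=re.I)

    print(g.parse("ABBc").pretty())
    print(g.parse("abdE").pretty())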

