@@ -4,14 +4,15 @@ from .lexer import TerminalDef
 ###{standalone

 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
     __serialize_namespace__ = TerminalDef,

-    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
+    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0):
         self.tokens = tokens
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
+        self.g_regex_flags = g_regex_flags

     def _deserialize(self):
         self.callbacks = {} # TODO
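With these two changes the flags become part of the lexer configuration itself: the constructor accepts them, and because 'g_regex_flags' is listed in __serialize_fields__ they survive serialization of a built parser. A minimal sketch of how the stored value is meant to be consumed, mirroring the TraditionalLexer call sites changed further down (`terminals` and `'WS'` are placeholders, not names from the patch):

    import re

    conf = LexerConf(terminals, ignore=['WS'], g_regex_flags=re.I)
    lexer = TraditionalLexer(conf.tokens, ignore=conf.ignore,
                             user_callbacks=conf.callbacks,
                             g_regex_flags=conf.g_regex_flags)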
@@ -48,6 +48,7 @@ class LarkOptions(Serialize):
         propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
         lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
         maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None
+        g_regex_flags - Flags that are applied to all Terminals (Regex and Strings)
     """

     if __doc__:
         __doc__ += OPTIONS_DOC
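From the user's side, the new option is passed to the Lark constructor like any other entry in this table. A minimal usage sketch with re.I as the example flag, mirroring the test_g_regex_flags test added at the end of this diff:

    import re
    from lark import Lark

    parser = Lark(r'''
        start: "a" /b+/
    ''', g_regex_flags=re.I)

    parser.parse("ABB")  # the string terminal "a" and the regexp /b+/ both match case-insensitively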
@@ -68,6 +69,7 @@ class LarkOptions(Serialize):
         'lexer_callbacks': {},
         'maybe_placeholders': False,
         'edit_terminals': None,
+        'g_regex_flags': 0,
     }

     def __init__(self, options_dict):

@@ -209,7 +211,7 @@ class Lark(Serialize):
                 if hasattr(t, term.name):
                     lexer_callbacks[term.name] = getattr(t, term.name)

-        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks)
+        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)

         if self.options.parser:
             self.parser = self._build_parser()

@@ -222,7 +224,7 @@
     __serialize_fields__ = 'parser', 'rules', 'options'

     def _build_lexer(self):
-        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

     def _prepare_callbacks(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)

@@ -230,7 +230,7 @@ class CallChain:
-def _create_unless(terminals):
+def _create_unless(terminals, g_regex_flags):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -241,19 +241,19 @@ def _create_unless(terminals):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s)
+            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))

     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback
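Forwarding the flags into the re.match call above matters because a global flag changes which string terminals count as being covered by a regexp terminal, and the unless-callbacks built from that decision must be compiled with the same flags. A toy illustration of the check itself, outside lark (not part of the patch):

    import re

    re.match(r'[a-z]+', 'ABC')        # None: without the flag the literal is not subsumed
    re.match(r'[a-z]+', 'ABC', re.I)  # matches the whole string, so with re.I the literal
                                      # would be routed through the UnlessCallback machinery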
-def _build_mres(terminals, max_size, match_whole):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, match_whole):
     mres = []
     while terminals:
         try:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]))
+            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
         except AssertionError: # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, match_whole)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)

         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres

-def build_mres(terminals, match_whole=False):
-    return _build_mres(terminals, len(terminals), match_whole)
+def build_mres(terminals, g_regex_flags, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)

 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -294,7 +294,7 @@ class Lexer(object):
 class TraditionalLexer(Lexer):

-    def __init__(self, terminals, ignore=(), user_callbacks={}):
+    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals

         terminals = list(terminals)

@@ -302,7 +302,7 @@ class TraditionalLexer(Lexer):
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp())
+                re.compile(t.pattern.to_regexp(), g_regex_flags)
             except re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

@@ -318,10 +318,10 @@ class TraditionalLexer(Lexer):
         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))

         self.terminals = terminals
         self.user_callbacks = user_callbacks
-        self.build()
+        self.build(g_regex_flags)

-    def build(self):
-        terminals, self.callback = _create_unless(self.terminals)
+    def build(self, g_regex_flags=0):
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
         assert all(self.callback.values())

         for type_, f in self.user_callbacks.items():

@@ -331,7 +331,7 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f

-        self.mres = build_mres(terminals)
+        self.mres = build_mres(terminals, g_regex_flags)

     def match(self, stream, pos):
         for mre, type_from_index in self.mres:
@@ -347,7 +347,7 @@
 class ContextualLexer(Lexer):

-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
+    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t

@@ -362,12 +362,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
+                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer

             self.lexers[state] = lexer

-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)

     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()

@@ -88,7 +88,7 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)

     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

 class LALR_WithLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):

@@ -112,7 +112,8 @@ class LALR_ContextualLexer(LALR_WithLexer):
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
-                                     user_callbacks=self.lexer_conf.callbacks)
+                                     user_callbacks=self.lexer_conf.callbacks,
+                                     g_regex_flags=self.lexer_conf.g_regex_flags)

     def parse(self, text, start=None):
@@ -187,7 +188,7 @@ class XEarley(_ParserFrontend):
                 if width == 0:
                     raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

-            self.regexps[t.name] = re.compile(regexp)
+            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)

     def parse(self, text, start):
         return self._parse(text, start)
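In the dynamic Earley front end each terminal keeps its own compiled expression, so the flags are applied per terminal rather than to one combined pattern. A rough sketch of the assumed effect (names are illustrative only, not from the patch):

    import re

    regexps = {'WORD': re.compile(r'[a-z]+', re.I)}
    regexps['WORD'].match('Hello world', 0).group(0)  # 'Hello': the scanner honours the global flag too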
@@ -34,6 +34,7 @@ class LarkOptions:
     maybe_placeholders: bool
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache_grammar: bool
+    g_regex_flags: int


 class Lark:

@@ -56,7 +57,8 @@ class Lark:
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,
         maybe_placeholders: bool = False,
-        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None
+        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
+        g_regex_flags: int = ...
     ):
         ...

@@ -112,7 +112,8 @@ class TraditionalLexer(Lexer):
         self,
         terminals: Collection[TerminalDef],
         ignore: Collection[str] = ...,
-        user_callbacks: Dict[str, _Callback] = ...
+        user_callbacks: Dict[str, _Callback] = ...,
+        g_regex_flags: int = ...
     ):
         ...

@@ -136,7 +137,8 @@ class ContextualLexer(Lexer):
         states: Dict[str, Collection[str]],
         ignore: Collection[str] = ...,
         always_accept: Collection[str] = ...,
-        user_callbacks: Dict[str, _Callback] = ...
+        user_callbacks: Dict[str, _Callback] = ...,
+        g_regex_flags: int = ...
     ):
         ...

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import

+import re
 import unittest
 import logging
 import os

@@ -538,7 +539,7 @@ class CustomLexer(Lexer):
     so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
     def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)

     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
@@ -831,7 +832,16 @@ def _make_parser_test(LEXER, PARSER):
             x = g.parse("starts")
             self.assertSequenceEqual(x.children, ['starts'])

+        def test_g_regex_flags(self):
+            g = _Lark("""
+                    start: "a" /b+/ C
+                    C: "C" | D
+                    D: "D" E
+                    E: "e"
+                    """, g_regex_flags=re.I)
+            x1 = g.parse("ABBc")
+            x2 = g.parse("abdE")
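Note on the test above: the grammar declares its terminals as "a", /b+/, "C", "D" and "e", while the inputs flip the case in several places ("ABBc" against "a" and /b+/, "dE" against "D" and "e"), so both parse calls succeed only because g_regex_flags=re.I is applied to string and regexp terminals alike; as written, the test only checks that parsing does not raise.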
        # def test_string_priority(self):
        #     g = _Lark("""start: (A | /a?bb/)+

@@ -1701,6 +1711,7 @@ def _make_parser_test(LEXER, PARSER):
     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME
+    _TestParser.__qualname__ = "tests.test_parser." + _NAME
     globals()[_NAME] = _TestParser

 # Note: You still have to import them in __main__ for the tests to run