@@ -23,6 +23,7 @@ class LarkOptions:
     transformer: Optional[Transformer]
     postlex: Optional[PostLex]
     ambiguity: str
+    regex: bool
     debug: bool
     keep_all_tokens: bool
     propagate_positions: bool
@@ -48,6 +49,7 @@ class Lark:
         transformer: Optional[Transformer] = None,
         postlex: Optional[PostLex] = None,
         ambiguity: Literal["explicit", "resolve"] = "resolve",
+        regex: bool = False,
         debug: bool = False,
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,
@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 from .grammar import Rule
+import re
+try:
+    import regex
+except ImportError:
+    regex = None
 ###{standalone
 class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
             When `False`, `[]` behaves like the `?` operator,
                 and returns no value at all.
             (default=`False`. Recommended to set to `True`)
+        regex - When True, uses the `regex` module instead of the stdlib `re`.
         cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
                 LALR only for now.
             When `False`, does nothing (default)
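For reference, a minimal usage sketch of the new option (not part of the diff; it assumes the third-party `regex` package is installed and mirrors the test added at the bottom of this changeset):

    # Minimal sketch: Unicode property escapes only work when regex=True,
    # because stdlib `re` has no \p{...} support.
    from lark import Lark

    parser = Lark(r"""
        ?start: NAME
        NAME: /[\p{Lu}\p{Ll}]+/
    """, regex=True)

    print(parser.parse("Hello"))   # the ?start rule returns the NAME token, equal to "Hello"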
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
         'start': 'start',
         'priority': 'auto',
         'ambiguity': 'auto',
+        'regex': False,
         'propagate_positions': False,
         'lexer_callbacks': {},
         'maybe_placeholders': False,
@@ -154,6 +162,16 @@ class Lark(Serialize):
         self.options = LarkOptions(options)
+        # Set regex or re module
+        use_regex = self.options.regex
+        if use_regex:
+            if regex:
+                self.re = regex
+            else:
+                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
+        else:
+            self.re = re
         # Some, but not all file-like objects have a 'name' attribute
         try:
             self.source = grammar.name
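The selection above is the only place the optional dependency is enforced: stdlib `re` stays the default, and `regex` is required only when explicitly requested. A standalone sketch of the same decision (the helper name is hypothetical, for illustration only):

    import re
    try:
        import regex
    except ImportError:
        regex = None

    def _pick_regex_module(use_regex):
        # Hypothetical helper mirroring the constructor logic above.
        if use_regex:
            if regex is None:
                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
            return regex
        return re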
@@ -224,7 +242,7 @@
         assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source)
+        self.grammar = load_grammar(grammar, self.source, self.re)
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -285,7 +303,7 @@
     def _build_parser(self):
         self._prepare_callbacks()
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
-        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)
     def save(self, f):
         data, m = self.memo_serialize([TerminalDef, Rule])
@@ -312,10 +330,11 @@
         if postlex is not None:
             options['postlex'] = postlex
         self.options = LarkOptions.deserialize(options, memo)
+        self.re = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source = '<deserialized>'
         self._prepare_callbacks()
-        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
+        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
         return self
     @classmethod
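Because the chosen module is not serialized, the deserialization path above re-derives `self.re` from the restored options and threads it into the parser frontend. A round-trip sketch, assuming the `regex` package is installed, that the `@classmethod` cut off above is the matching `Lark.load`, and LALR (which is what this deserialize path goes through):

    # Sketch: the regex=True choice survives Lark.save() / Lark.load().
    from io import BytesIO
    from lark import Lark

    parser = Lark(r"""
        start: WORD
        WORD: /[\p{L}]+/
    """, parser='lalr', regex=True)

    buf = BytesIO()
    parser.save(buf)
    buf.seek(0)

    restored = Lark.load(buf)          # self.re is recomputed from options.regex
    print(restored.parse("héllo"))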
@@ -1,9 +1,10 @@
 ## Lexer Implementation
+import re
 try:
-    import regex as re
+    import regex
 except ImportError:
-    import re
+    regex = None
 from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
@@ -233,7 +234,7 @@ class CallChain:
-def _create_unless(terminals, g_regex_flags):
+def _create_unless(terminals, g_regex_flags, re_):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -244,7 +245,7 @@ def _create_unless(terminals, g_regex_flags):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
+            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
@@ -297,16 +298,17 @@ class Lexer(object):
 class TraditionalLexer(Lexer):
-    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
         terminals = list(terminals)
+        self.re = re_
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp(), g_regex_flags)
-            except re.error:
+                self.re.compile(t.pattern.to_regexp(), g_regex_flags)
+            except self.re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
             if t.pattern.min_width == 0:
@@ -324,7 +326,7 @@ class TraditionalLexer(Lexer):
         self.build(g_regex_flags)
     def build(self, g_regex_flags=0):
-        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
         assert all(self.callback.values())
         for type_, f in self.user_callbacks.items():
@@ -350,7 +352,8 @@ class TraditionalLexer(Lexer):
 class ContextualLexer(Lexer):
-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+        self.re = re_
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
@@ -365,12 +368,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+                lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer
             self.lexers[state] = lexer
-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+        self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
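With these changes neither lexer touches a module-level `re` any more; whichever module `Lark` selected is passed down explicitly as `re_`. A small sketch against the (internal) lexer API to illustrate the new parameter; ordinary code should keep going through `Lark(...)`:

    # Illustration only: constructing TraditionalLexer with an explicit module object.
    import re
    from lark.lexer import TraditionalLexer, TerminalDef, PatternRE

    terminals = [TerminalDef('WORD', PatternRE(r'\w+')),
                 TerminalDef('WS', PatternRE(r'\s+'))]
    lexer = TraditionalLexer(terminals, re_=re, ignore=['WS'])

    print([tok.type for tok in lexer.lex('hello world')])   # ['WORD', 'WORD']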
@@ -616,7 +616,7 @@ class Grammar:
 _imported_grammars = {}
-def import_grammar(grammar_path, base_paths=[]):
+def import_grammar(grammar_path, re_, base_paths=[]):
     if grammar_path not in _imported_grammars:
         import_paths = base_paths + IMPORT_PATHS
         for import_path in import_paths:
@@ -624,7 +624,7 @@ def import_grammar(grammar_path, base_paths=[]):
                 joined_path = os.path.join(import_path, grammar_path)
                 with open(joined_path, encoding='utf8') as f:
                     text = f.read()
-                grammar = load_grammar(text, joined_path)
+                grammar = load_grammar(text, joined_path, re_)
                 _imported_grammars[grammar_path] = grammar
                 break
         else:
@@ -755,7 +755,8 @@ def _find_used_symbols(tree):
               for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
 class GrammarLoader:
-    def __init__(self):
+    def __init__(self, re_):
+        self.re = re_
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -764,7 +765,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)
         self.canonize_tree = CanonizeTree()
@@ -862,7 +863,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, base_paths=base_paths)
+            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
             term_defs += new_td
@@ -942,4 +943,5 @@ class GrammarLoader:
-load_grammar = GrammarLoader().load_grammar
+def load_grammar(grammar, source, re_):
+    return GrammarLoader(re_).load_grammar(grammar, source)
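`load_grammar` is now a thin wrapper that builds a `GrammarLoader` around the caller's module choice, and `import_grammar` forwards it, so `%import`ed grammars are compiled with the same engine. A hedged end-to-end sketch (assuming the `regex` package is installed):

    # Sketch: terminals pulled in via %import are compiled with the same module
    # as the rest of the grammar when regex=True.
    from lark import Lark

    parser = Lark(r"""
        start: NAME+
        NAME: /[\p{L}]+/
        %import common.WS
        %ignore WS
    """, regex=True)

    print(parser.parse("héllo wörld"))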
@@ -1,7 +1,3 @@
-try:
-    import regex as re
-except ImportError:
-    import re
 from functools import partial
 from .utils import get_regexp_width, Serialize
@@ -66,14 +62,16 @@ class WithLexer(_ParserFrontend):
     __serialize_fields__ = 'parser', 'lexer_conf', 'start'
     __serialize_namespace__ = LexerConf,
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         self.lexer_conf = lexer_conf
         self.start = parser_conf.start
         self.postlex = lexer_conf.postlex
+        self.re = re_
     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex):
+    def deserialize(cls, data, memo, callbacks, postlex, re_):
         inst = super(WithLexer, cls).deserialize(data, memo)
+        inst.re = re_
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
         inst.init_lexer()
@@ -91,13 +89,14 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)
     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
 class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         debug = options.debug if options else False
+        self.re = re_
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_lexer()
@@ -113,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     re_=self.re,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
                                      user_callbacks=self.lexer_conf.callbacks,
@@ -129,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}
 class LALR_CustomLexer(LALR_WithLexer):
-    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.lexer = lexer_cls(lexer_conf)
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
+        self.lexer = lexer_cls(lexer_conf, re_=re_)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
 def tokenize_text(text):
@@ -146,8 +146,8 @@ def tokenize_text(text):
             yield Token('CHAR', ch, line=line, column=i - col_start_pos)
 class Earley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()
         resolve_ambiguity = options.ambiguity == 'resolve'
@@ -159,7 +159,9 @@ class Earley(WithLexer):
 class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
+        self.re = re_
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
         self.start = parser_conf.start
@@ -191,7 +193,7 @@ class XEarley(_ParserFrontend):
                 if width == 0:
                     raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
-            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
+            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)
     def parse(self, text, start):
         return self._parse(text, start)
@@ -204,8 +206,8 @@ class XEarley_CompleteLex(XEarley):
 class CYK(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()
         self._analysis = GrammarAnalyzer(parser_conf)
@@ -551,8 +551,8 @@ class CustomLexer(Lexer):
         Purpose of this custom lexer is to test the integration,
         so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
-    def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
+    def __init__(self, lexer_conf, re_):
+        self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
@@ -17,7 +17,7 @@ class TestRegex(unittest.TestCase):
                         NAME: ID_START ID_CONTINUE*
                         ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
                         ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
-                    """)
+                    """, regex=True)
         self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
@@ -26,7 +26,7 @@ class TestRegex(unittest.TestCase):
         g = Lark(r"""
                     ?start: NAME
                     NAME: /[\w]+/
-                """)
+                """, regex=True)
         self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
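For contrast with the tests above, the same grammar without `regex=True` is compiled with stdlib `re`, which rejects `\p{...}` escapes on current Python versions. A sketch of the failure mode (the exact exception type depends on where compilation fails, so it is caught broadly here):

    from lark import Lark

    grammar = r"""
        ?start: NAME
        NAME: /[\p{Lu}\p{Ll}]+/
    """

    Lark(grammar, regex=True)      # OK: compiled with the `regex` module

    try:
        Lark(grammar)              # stdlib `re`: \p{...} is not supported
    except Exception as exc:
        print("rejected as expected:", type(exc).__name__)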