
Added `regex` module as optional mode.
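For context, a minimal usage sketch of the new option (the grammar below is illustrative, and the `regex` package is assumed to be installed):

    # Hypothetical example; requires `pip install regex`.
    from lark import Lark

    parser = Lark(r"""
        ?start: NAME
        NAME: /[\p{L}]+/    // Unicode property escapes need the `regex` engine
    """, regex=True)        # raises ImportError if `regex` is not installed

    print(parser.parse("வணக்கம்"))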

julienmalard, 4 years ago
commit 86a162d6d8
tag: tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.9.0
7 changed files with 70 additions and 42 deletions:

  1. lark-stubs/lark.pyi         +2   −0
  2. lark/lark.py                +22  −3
  3. lark/lexer.py               +14  −11
  4. lark/load_grammar.py        +8   −6
  5. lark/parser_frontends.py    +20  −18
  6. tests/test_parser.py        +2   −2
  7. tests/test_regex.py         +2   −2

lark-stubs/lark.pyi  +2 −0

@@ -23,6 +23,7 @@ class LarkOptions:
     transformer: Optional[Transformer]
     postlex: Optional[PostLex]
     ambiguity: str
+    regex: bool
     debug: bool
     keep_all_tokens: bool
     propagate_positions: bool
@@ -48,6 +49,7 @@ class Lark:
         transformer: Optional[Transformer] = None,
         postlex: Optional[PostLex] = None,
         ambiguity: Literal["explicit", "resolve"] = "resolve",
+        regex: bool = False,
         debug: bool = False,
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,


lark/lark.py  +22 −3

@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import get_frontend
 from .grammar import Rule
 
+import re
+try:
+    import regex
+except ImportError:
+    regex = None
+
 ###{standalone
 
 class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
             When `False`, `[]` behaves like the `?` operator,
             and returns no value at all.
             (default=`False`. Recommended to set to `True`)
+    regex - When True, uses the `regex` module instead of the stdlib `re`.
     cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
             LALR only for now.
             When `False`, does nothing (default)
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
         'start': 'start',
         'priority': 'auto',
         'ambiguity': 'auto',
+        'regex': False,
         'propagate_positions': False,
         'lexer_callbacks': {},
         'maybe_placeholders': False,
@@ -154,6 +162,16 @@ class Lark(Serialize):
 
         self.options = LarkOptions(options)
 
+        # Set regex or re module
+        use_regex = self.options.regex
+        if use_regex:
+            if regex:
+                self.re = regex
+            else:
+                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
+        else:
+            self.re = re
+
         # Some, but not all file-like objects have a 'name' attribute
         try:
             self.source = grammar.name
@@ -224,7 +242,7 @@ class Lark(Serialize):
         assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )
 
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source)
+        self.grammar = load_grammar(grammar, self.source, self.re)
 
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -285,7 +303,7 @@ class Lark(Serialize):
     def _build_parser(self):
         self._prepare_callbacks()
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
-        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
+        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)
 
     def save(self, f):
         data, m = self.memo_serialize([TerminalDef, Rule])
@@ -312,10 +330,11 @@ class Lark(Serialize):
         if postlex is not None:
             options['postlex'] = postlex
         self.options = LarkOptions.deserialize(options, memo)
+        self.re = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source = '<deserialized>'
         self._prepare_callbacks()
-        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
+        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
         return self
 
     @classmethod
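The engine-selection block added above follows a common optional-dependency pattern; here is a self-contained sketch of the same idea (the helper name `pick_regex_module` is illustrative, not part of lark):

    # Standalone sketch of the optional-dependency pattern used in lark/lark.py.
    import re

    try:
        import regex              # third-party engine with extras such as \p{...}
    except ImportError:
        regex = None              # stdlib `re` stays the default

    def pick_regex_module(use_regex):
        """Mirror Lark's self.re selection logic."""
        if use_regex:
            if regex is None:
                raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
            return regex
        return re

    engine = pick_regex_module(use_regex=False)
    print(engine.findall(r'\d+', 'a1b22c333'))    # ['1', '22', '333'] with either engine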


lark/lexer.py  +14 −11

@@ -1,9 +1,10 @@
 ## Lexer Implementation
 
+import re
 try:
-    import regex as re
+    import regex
 except ImportError:
-    import re
+    regex = None
 
 from .utils import Str, classify, get_regexp_width, Py36, Serialize
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
@@ -233,7 +234,7 @@ class CallChain:
 
 
 
-def _create_unless(terminals, g_regex_flags):
+def _create_unless(terminals, g_regex_flags, re_):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -244,7 +245,7 @@ def _create_unless(terminals, g_regex_flags):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
+            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
@@ -297,16 +298,17 @@ class Lexer(object):
 
 class TraditionalLexer(Lexer):
 
-    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
 
         terminals = list(terminals)
 
+        self.re = re_
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp(), g_regex_flags)
-            except re.error:
+                self.re.compile(t.pattern.to_regexp(), g_regex_flags)
+            except self.re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
 
             if t.pattern.min_width == 0:
@@ -324,7 +326,7 @@ class TraditionalLexer(Lexer):
         self.build(g_regex_flags)
 
     def build(self, g_regex_flags=0):
-        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
         assert all(self.callback.values())
 
         for type_, f in self.user_callbacks.items():
@@ -350,7 +352,8 @@ class TraditionalLexer(Lexer):
 
 class ContextualLexer(Lexer):
 
-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+    def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
+        self.re = re_
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
@@ -365,12 +368,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+                lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer
 
             self.lexers[state] = lexer
 
-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
+        self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
 
     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
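Injecting the module object as `re_` works because `re` and `regex` expose the same small surface the lexer relies on (`compile`, `match`, and a module-level `error`); a quick illustration, assuming `regex` is installed (the helper name is not part of lark):

    # Illustration only: both engines satisfy the interface the lexer needs.
    import re
    import regex    # assumed installed

    def compile_terminal(re_, pattern, flags=0):
        """Compile a terminal pattern with whichever engine was injected."""
        try:
            return re_.compile(pattern, flags)
        except re_.error as e:    # both modules define a module-level `error`
            raise ValueError('Cannot compile pattern %r: %s' % (pattern, e))

    print(compile_terminal(re, r'[a-z]+').match('abc').group(0))        # abc
    print(compile_terminal(regex, r'\p{L}+').match('déjà').group(0))    # déjà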


lark/load_grammar.py  +8 −6

@@ -616,7 +616,7 @@ class Grammar:
 
 
 _imported_grammars = {}
-def import_grammar(grammar_path, base_paths=[]):
+def import_grammar(grammar_path, re_, base_paths=[]):
     if grammar_path not in _imported_grammars:
         import_paths = base_paths + IMPORT_PATHS
         for import_path in import_paths:
@@ -624,7 +624,7 @@ def import_grammar(grammar_path, base_paths=[]):
             joined_path = os.path.join(import_path, grammar_path)
             with open(joined_path, encoding='utf8') as f:
                 text = f.read()
-            grammar = load_grammar(text, joined_path)
+            grammar = load_grammar(text, joined_path, re_)
             _imported_grammars[grammar_path] = grammar
             break
         else:
@@ -755,7 +755,8 @@ def _find_used_symbols(tree):
            for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
 
 class GrammarLoader:
-    def __init__(self):
+    def __init__(self, re_):
+        self.re = re_
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -764,7 +765,7 @@ class GrammarLoader:
         lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)
 
         self.canonize_tree = CanonizeTree()
@@ -862,7 +863,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, base_paths=base_paths)
+            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td
@@ -942,4 +943,5 @@ class GrammarLoader:
 
 
 
-load_grammar = GrammarLoader().load_grammar
+def load_grammar(grammar, source, re_):
+    return GrammarLoader(re_).load_grammar(grammar, source)

lark/parser_frontends.py  +20 −18

@@ -1,7 +1,3 @@
-try:
-    import regex as re
-except ImportError:
-    import re
 from functools import partial
 
 from .utils import get_regexp_width, Serialize
@@ -66,14 +62,16 @@ class WithLexer(_ParserFrontend):
     __serialize_fields__ = 'parser', 'lexer_conf', 'start'
     __serialize_namespace__ = LexerConf,
 
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         self.lexer_conf = lexer_conf
         self.start = parser_conf.start
         self.postlex = lexer_conf.postlex
+        self.re = re_
 
     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex):
+    def deserialize(cls, data, memo, callbacks, postlex, re_):
         inst = super(WithLexer, cls).deserialize(data, memo)
+        inst.re = re_
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
         inst.init_lexer()
@@ -91,13 +89,14 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)
 
     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
 
 class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
         debug = options.debug if options else False
+        self.re = re_
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
 
         self.init_lexer()
 
@@ -113,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = self.postlex.always_accept if self.postlex else ()
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
+                                     re_=self.re,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
                                      user_callbacks=self.lexer_conf.callbacks,
@@ -129,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}
 
 class LALR_CustomLexer(LALR_WithLexer):
-    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.lexer = lexer_cls(lexer_conf)
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
+        self.lexer = lexer_cls(lexer_conf, re_=re_)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
 
 
 def tokenize_text(text):
@@ -146,8 +146,8 @@ def tokenize_text(text):
             yield Token('CHAR', ch, line=line, column=i - col_start_pos)
 
 class Earley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()
 
         resolve_ambiguity = options.ambiguity == 'resolve'
@@ -159,7 +159,9 @@ class Earley(WithLexer):
 
 
 class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
+    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
+        self.re = re_
+
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
         self.start = parser_conf.start
 
@@ -191,7 +193,7 @@ class XEarley(_ParserFrontend):
             if width == 0:
                 raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
 
-            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
+            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)
 
     def parse(self, text, start):
         return self._parse(text, start)
@@ -204,8 +206,8 @@ class XEarley_CompleteLex(XEarley):
 
 class CYK(WithLexer):
 
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
+    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
         self.init_traditional_lexer()
 
         self._analysis = GrammarAnalyzer(parser_conf)


tests/test_parser.py  +2 −2

@@ -551,8 +551,8 @@ class CustomLexer(Lexer):
        Purpose of this custom lexer is to test the integration,
        so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
-    def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
+    def __init__(self, lexer_conf, re_):
+        self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
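Custom lexers now receive the regex engine as a second constructor argument; a minimal sketch of a conforming lexer, modelled on the test above (the class name is illustrative):

    # Sketch: pass-through custom lexer matching the updated constructor signature.
    from lark.lexer import Lexer, TraditionalLexer

    class MyCustomLexer(Lexer):
        """Delegates to TraditionalLexer, forwarding the injected regex engine."""
        def __init__(self, lexer_conf, re_):
            self.lexer = TraditionalLexer(lexer_conf.tokens, re_,
                                          ignore=lexer_conf.ignore,
                                          user_callbacks=lexer_conf.callbacks,
                                          g_regex_flags=lexer_conf.g_regex_flags)

        def lex(self, *args, **kwargs):
            return self.lexer.lex(*args, **kwargs)

It would be passed to Lark the same way as before, e.g. `Lark(grammar, parser='lalr', lexer=MyCustomLexer)`.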




tests/test_regex.py  +2 −2

@@ -17,7 +17,7 @@ class TestRegex(unittest.TestCase):
                 NAME: ID_START ID_CONTINUE*
                 ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
                 ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
-        """)
+        """, regex=True)
 
         self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
 
@@ -26,7 +26,7 @@ class TestRegex(unittest.TestCase):
         g = Lark(r"""
                     ?start: NAME
                     NAME: /[\w]+/
-        """)
+        """, regex=True)
         self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
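These tests exercise Unicode property escapes (`\p{...}`), which the stdlib `re` rejects but `regex` accepts; a quick check of that difference, assuming `regex` is installed:

    import re
    import regex    # assumed installed

    print(bool(regex.match(r'\p{L}+', 'வணக்கம்')))    # True: property escapes supported
    try:
        re.compile(r'\p{L}+')
    except re.error as e:
        print('stdlib re rejects it:', e)             # bad escape \p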





