Browse Source

Added the `regex` module as an optional, opt-in alternative to the stdlib `re` module.

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.9.0
julienmalard 4 years ago
parent
commit
86a162d6d8
7 changed files with 70 additions and 42 deletions
  1. +2
    -0
      lark-stubs/lark.pyi
  2. +22
    -3
      lark/lark.py
  3. +14
    -11
      lark/lexer.py
  4. +8
    -6
      lark/load_grammar.py
  5. +20
    -18
      lark/parser_frontends.py
  6. +2
    -2
      tests/test_parser.py
  7. +2
    -2
      tests/test_regex.py

+ 2
- 0
lark-stubs/lark.pyi View File

@@ -23,6 +23,7 @@ class LarkOptions:
transformer: Optional[Transformer]
postlex: Optional[PostLex]
ambiguity: str
regex: bool
debug: bool
keep_all_tokens: bool
propagate_positions: bool
@@ -48,6 +49,7 @@ class Lark:
transformer: Optional[Transformer] = None,
postlex: Optional[PostLex] = None,
ambiguity: Literal["explicit", "resolve"] = "resolve",
regex: bool = False,
debug: bool = False,
keep_all_tokens: bool = False,
propagate_positions: bool = False,


+ 22
- 3
lark/lark.py View File

@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend
from .grammar import Rule

import re
try:
import regex
except ImportError:
regex = None

###{standalone

class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
When `False`, `[]` behaves like the `?` operator,
and returns no value at all.
(default=`False`. Recommended to set to `True`)
regex - When `True`, uses the `regex` module instead of the stdlib `re`.
The `regex` module must be installed, otherwise an ImportError is raised.
(default=`False`)
cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
LALR only for now.
When `False`, does nothing (default)
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
'start': 'start',
'priority': 'auto',
'ambiguity': 'auto',
'regex': False,
'propagate_positions': False,
'lexer_callbacks': {},
'maybe_placeholders': False,
@@ -154,6 +162,16 @@ class Lark(Serialize):

self.options = LarkOptions(options)

# Set regex or re module
use_regex = self.options.regex
if use_regex:
if regex:
self.re = regex
else:
raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
else:
self.re = re

# Some, but not all file-like objects have a 'name' attribute
try:
self.source = grammar.name
@@ -224,7 +242,7 @@ class Lark(Serialize):
assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )

# Parse the grammar file and compose the grammars (TODO)
self.grammar = load_grammar(grammar, self.source)
self.grammar = load_grammar(grammar, self.source, self.re)

# Compile the EBNF grammar into BNF
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -285,7 +303,7 @@ class Lark(Serialize):
def _build_parser(self):
self._prepare_callbacks()
parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)

def save(self, f):
data, m = self.memo_serialize([TerminalDef, Rule])
@@ -312,10 +330,11 @@ class Lark(Serialize):
if postlex is not None:
options['postlex'] = postlex
self.options = LarkOptions.deserialize(options, memo)
self.re = regex if self.options.regex else re
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source = '<deserialized>'
self._prepare_callbacks()
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
return self

@classmethod


+ 14
- 11
lark/lexer.py View File

@@ -1,9 +1,10 @@
## Lexer Implementation

import re
try:
import regex as re
import regex
except ImportError:
import re
regex = None

from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
@@ -233,7 +234,7 @@ class CallChain:



def _create_unless(terminals, g_regex_flags):
def _create_unless(terminals, g_regex_flags, re_):
tokens_by_type = classify(terminals, lambda t: type(t.pattern))
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
embedded_strs = set()
@@ -244,7 +245,7 @@ def _create_unless(terminals, g_regex_flags):
if strtok.priority > retok.priority:
continue
s = strtok.pattern.value
m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
if m and m.group(0) == s:
unless.append(strtok)
if strtok.pattern.flags <= retok.pattern.flags:
@@ -297,16 +298,17 @@ class Lexer(object):

class TraditionalLexer(Lexer):

def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
assert all(isinstance(t, TerminalDef) for t in terminals), terminals

terminals = list(terminals)

self.re = re_
# Sanitization
for t in terminals:
try:
re.compile(t.pattern.to_regexp(), g_regex_flags)
except re.error:
self.re.compile(t.pattern.to_regexp(), g_regex_flags)
except self.re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

if t.pattern.min_width == 0:
@@ -324,7 +326,7 @@ class TraditionalLexer(Lexer):
self.build(g_regex_flags)

def build(self, g_regex_flags=0):
terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
assert all(self.callback.values())

for type_, f in self.user_callbacks.items():
@@ -350,7 +352,8 @@ class TraditionalLexer(Lexer):

class ContextualLexer(Lexer):

def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
self.re = re_
tokens_by_name = {}
for t in terminals:
assert t.name not in tokens_by_name, t
@@ -365,12 +368,12 @@ class ContextualLexer(Lexer):
except KeyError:
accepts = set(accepts) | set(ignore) | set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
lexer_by_tokens[key] = lexer

self.lexers[state] = lexer

self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)

def lex(self, stream, get_parser_state):
parser_state = get_parser_state()


+ 8
- 6
lark/load_grammar.py View File

@@ -616,7 +616,7 @@ class Grammar:


_imported_grammars = {}
def import_grammar(grammar_path, base_paths=[]):
def import_grammar(grammar_path, re_, base_paths=[]):
if grammar_path not in _imported_grammars:
import_paths = base_paths + IMPORT_PATHS
for import_path in import_paths:
@@ -624,7 +624,7 @@ def import_grammar(grammar_path, base_paths=[]):
joined_path = os.path.join(import_path, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
grammar = load_grammar(text, joined_path)
grammar = load_grammar(text, joined_path, re_)
_imported_grammars[grammar_path] = grammar
break
else:
@@ -755,7 +755,8 @@ def _find_used_symbols(tree):
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

class GrammarLoader:
def __init__(self):
def __init__(self, re_):
self.re = re_
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -764,7 +765,7 @@ class GrammarLoader:
lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)

self.canonize_tree = CanonizeTree()

@@ -862,7 +863,7 @@ class GrammarLoader:
# import grammars
for dotted_path, (base_paths, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = import_grammar(grammar_path, base_paths=base_paths)
g = import_grammar(grammar_path, self.re, base_paths=base_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td
@@ -942,4 +943,5 @@ class GrammarLoader:



load_grammar = GrammarLoader().load_grammar
# One GrammarLoader per regex backend, created on first use.
# Building a loader constructs the LALR parser for Lark's own grammar syntax,
# so creating a fresh one for every grammar (and for every imported grammar,
# which also goes through load_grammar) repeats that work needlessly.
# A single shared loader was previously used for all calls, so reusing an
# instance across grammars is safe.
_grammar_loaders = {}

def load_grammar(grammar, source, re_):
    """Load a grammar using a GrammarLoader bound to the given regex module.

    grammar - the grammar definition, passed through to GrammarLoader.load_grammar
    source - passed through unchanged (callers supply the grammar's name or path)
    re_ - the regular-expression module (`re` or `regex`) handed to GrammarLoader

    Returns whatever GrammarLoader.load_grammar returns.
    """
    try:
        loader = _grammar_loaders[re_]
    except KeyError:
        # First request for this backend: build the loader once and keep it.
        loader = _grammar_loaders[re_] = GrammarLoader(re_)
    return loader.load_grammar(grammar, source)

+ 20
- 18
lark/parser_frontends.py View File

@@ -1,7 +1,3 @@
try:
import regex as re
except ImportError:
import re
from functools import partial

from .utils import get_regexp_width, Serialize
@@ -66,14 +62,16 @@ class WithLexer(_ParserFrontend):
__serialize_fields__ = 'parser', 'lexer_conf', 'start'
__serialize_namespace__ = LexerConf,

def __init__(self, lexer_conf, parser_conf, options=None):
def __init__(self, lexer_conf, parser_conf, re_, options=None):
self.lexer_conf = lexer_conf
self.start = parser_conf.start
self.postlex = lexer_conf.postlex
self.re = re_

@classmethod
def deserialize(cls, data, memo, callbacks, postlex):
def deserialize(cls, data, memo, callbacks, postlex, re_):
inst = super(WithLexer, cls).deserialize(data, memo)
inst.re = re_
inst.postlex = postlex
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
inst.init_lexer()
@@ -91,13 +89,14 @@ class WithLexer(_ParserFrontend):
return self._parse(token_stream, start)

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)

class LALR_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
def __init__(self, lexer_conf, parser_conf, re_, options=None):
debug = options.debug if options else False
self.re = re_
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)

self.init_lexer()

@@ -113,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
re_=self.re,
ignore=self.lexer_conf.ignore,
always_accept=always_accept,
user_callbacks=self.lexer_conf.callbacks,
@@ -129,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
###}

class LALR_CustomLexer(LALR_WithLexer):
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
self.lexer = lexer_cls(lexer_conf)
def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
self.lexer = lexer_cls(lexer_conf, re_=re_)
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)


def tokenize_text(text):
@@ -146,8 +146,8 @@ def tokenize_text(text):
yield Token('CHAR', ch, line=line, column=i - col_start_pos)

class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
def __init__(self, lexer_conf, parser_conf, re_, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
self.init_traditional_lexer()

resolve_ambiguity = options.ambiguity == 'resolve'
@@ -159,7 +159,9 @@ class Earley(WithLexer):


class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
self.re = re_

self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.start = parser_conf.start

@@ -191,7 +193,7 @@ class XEarley(_ParserFrontend):
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(text, start)
@@ -204,8 +206,8 @@ class XEarley_CompleteLex(XEarley):

class CYK(WithLexer):

def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
def __init__(self, lexer_conf, parser_conf, re_, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
self.init_traditional_lexer()

self._analysis = GrammarAnalyzer(parser_conf)


+ 2
- 2
tests/test_parser.py View File

@@ -551,8 +551,8 @@ class CustomLexer(Lexer):
Purpose of this custom lexer is to test the integration,
so it uses the TraditionalLexer as its implementation, without any custom lexing behaviour.
"""
def __init__(self, lexer_conf):
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
def __init__(self, lexer_conf, re_):
self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)



+ 2
- 2
tests/test_regex.py View File

@@ -17,7 +17,7 @@ class TestRegex(unittest.TestCase):
NAME: ID_START ID_CONTINUE*
ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
""")
""", regex=True)

self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

@@ -26,7 +26,7 @@ class TestRegex(unittest.TestCase):
g = Lark(r"""
?start: NAME
NAME: /[\w]+/
""")
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')




Loading…
Cancel
Save