
Lark now loads faster

- Refactored lexer interface into LexerConf
- Lexer now compiles regexps only when used (especially useful for ContextualLexer); a sketch of the idea follows the changed-file list below
- Lexer now doesn't validate on deserialize (noticeable speedup)
Erez Sh, 4 years ago
commit 7dc00179e6 (tag 0.10.0)
7 changed files with 84 additions and 79 deletions:

  1. examples/python_parser.py   (+8, -12)
  2. lark/common.py              (+4, -2)
  3. lark/lark.py                (+7, -7)
  4. lark/lexer.py               (+39, -24)
  5. lark/load_grammar.py        (+5, -5)
  6. lark/parser_frontends.py    (+18, -26)
  7. tests/test_parser.py        (+3, -3)
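
The gist of the change is small enough to sketch. The class below is illustrative only (DemoLexer and its fields are invented, not lark's real names); it shows the two ideas from the commit message: regexps are compiled on first use, and a skip_validation switch lets an already-validated (e.g. deserialized) parser bypass the sanity checks.

    import re

    class DemoLexer:
        # Illustrative stand-in for the pattern used in lark/lexer.py below.
        def __init__(self, patterns, skip_validation=False):
            self.patterns = patterns                  # e.g. {'NUMBER': r'\d+'}
            if not skip_validation:
                for name, pattern in patterns.items():
                    re.compile(pattern)               # fail early on bad patterns
            self._compiled = None                     # nothing compiled yet

        @property
        def compiled(self):
            # Compiled lazily, so building many lexers (one per parser state,
            # as the contextual lexer does) stays cheap until they are used.
            if self._compiled is None:
                self._compiled = {name: re.compile(p)
                                  for name, p in self.patterns.items()}
            return self._compiled

    lexer = DemoLexer({'NUMBER': r'\d+'})             # fast: nothing compiled here
    match = lexer.compiled['NUMBER'].match('42')      # regexps compiled on first use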

examples/python_parser.py  (+8, -12)

@@ -26,6 +26,13 @@ python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs)
python_parser3 = Lark.open('python3.lark',parser='lalr', **kwargs)
python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs)

try:
xrange
except NameError:
chosen_parser = python_parser3
else:
chosen_parser = python_parser2


def _read(fn, *args):
kwargs = {'encoding': 'iso-8859-1'}
@@ -42,24 +49,13 @@ def _get_lib_path():
return [x for x in sys.path if x.endswith('%s.%s' % sys.version_info[:2])][0]

def test_python_lib():

path = _get_lib_path()

start = time.time()
files = glob.glob(path+'/*.py')
for f in files:
print( f )
try:
# print list(python_parser.lex(_read(os.path.join(path, f)) + '\n'))
try:
xrange
except NameError:
python_parser3.parse(_read(os.path.join(path, f)) + '\n')
else:
python_parser2.parse(_read(os.path.join(path, f)) + '\n')
except:
print ('At %s' % f)
raise
chosen_parser.parse(_read(os.path.join(path, f)) + '\n')

end = time.time()
print( "test_python_lib (%d files), time: %s secs"%(len(files), end-start) )


lark/common.py  (+4, -2)

@@ -7,12 +7,14 @@ class LexerConf(Serialize):
__serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
__serialize_namespace__ = TerminalDef,

def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0):
self.tokens = tokens
def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
self.tokens = tokens # TODO should be terminals
self.ignore = ignore
self.postlex = postlex
self.callbacks = callbacks or {}
self.g_regex_flags = g_regex_flags
self.re_module = re_module
self.skip_validation = skip_validation

def _deserialize(self):
self.callbacks = {} # TODO

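With this change the regex module and the validation switch travel with the configuration object instead of being passed around separately. A rough sketch of constructing the new LexerConf, assuming the lark version in this commit (the terminal list is a toy placeholder; real code gets it from a compiled grammar):

    import re
    from lark.common import LexerConf
    from lark.lexer import TerminalDef, PatternRE

    terminals = [TerminalDef('NUMBER', PatternRE(r'\d+')),
                 TerminalDef('WS', PatternRE(r'\s+'))]

    # The second positional argument is new: the re module to use (re or regex).
    conf = LexerConf(terminals, re, ignore=['WS'],
                     g_regex_flags=0, skip_validation=False)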

lark/lark.py  (+7, -7)

@@ -166,11 +166,11 @@ class Lark(Serialize):
use_regex = self.options.regex
if use_regex:
if regex:
self.re = regex
re_module = regex
else:
raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
else:
self.re = re
re_module = re

# Some, but not all file-like objects have a 'name' attribute
try:
@@ -243,7 +243,7 @@ class Lark(Serialize):
assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )

# Parse the grammar file and compose the grammars (TODO)
self.grammar = load_grammar(grammar, self.source, self.re)
self.grammar = load_grammar(grammar, self.source, re_module)

# Compile the EBNF grammar into BNF
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -276,7 +276,7 @@ class Lark(Serialize):
if hasattr(t, term.name):
lexer_callbacks[term.name] = getattr(t, term.name)

self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)

if self.options.parser:
self.parser = self._build_parser()
@@ -304,7 +304,7 @@ class Lark(Serialize):
def _build_parser(self):
self._prepare_callbacks()
parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

def save(self, f):
data, m = self.memo_serialize([TerminalDef, Rule])
@@ -331,11 +331,11 @@ class Lark(Serialize):
if postlex is not None:
options['postlex'] = postlex
self.options = LarkOptions.deserialize(options, memo)
self.re = regex if self.options.regex else re
re_module = regex if self.options.regex else re
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source = '<deserialized>'
self._prepare_callbacks()
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module)
return self

@classmethod

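From the user's side, the only visible knob here is the regex option: it decides whether re_module is the stdlib re or the third-party regex package. A hedged usage sketch with a toy grammar:

    from lark import Lark

    grammar = r"""
    start: WORD+
    WORD: /[a-z]+/
    %import common.WS
    %ignore WS
    """

    parser = Lark(grammar, parser='lalr')   # uses the stdlib re module

    # Lark(grammar, parser='lalr', regex=True) would use the `regex` module
    # instead, and raises ImportError if that package is not installed.

    print(parser.parse("hello world"))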

lark/lexer.py  (+39, -24)

@@ -6,6 +6,7 @@ from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

###{standalone
from copy import copy

class Pattern(Serialize):

@@ -88,7 +89,6 @@ class TerminalDef(Serialize):
return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)



class Token(Str):
__slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

@@ -294,35 +294,39 @@ class Lexer(object):

class TraditionalLexer(Lexer):

def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
def __init__(self, conf):
terminals = list(conf.tokens)
assert all(isinstance(t, TerminalDef) for t in terminals), terminals

terminals = list(terminals)
self.re = conf.re_module

self.re = re_
# Sanitization
for t in terminals:
try:
self.re.compile(t.pattern.to_regexp(), g_regex_flags)
except self.re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
if not conf.skip_validation:
# Sanitization
for t in terminals:
try:
self.re.compile(t.pattern.to_regexp(), conf.g_regex_flags)
except self.re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))

assert set(ignore) <= {t.name for t in terminals}
assert set(conf.ignore) <= {t.name for t in terminals}

# Init
self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
self.ignore_types = list(ignore)
self.ignore_types = list(conf.ignore)

terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
self.terminals = terminals
self.user_callbacks = user_callbacks
self.build(g_regex_flags)
self.user_callbacks = conf.callbacks
self.g_regex_flags = conf.g_regex_flags

self._mres = None
# self.build(g_regex_flags)

def build(self, g_regex_flags=0):
terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
def _build(self):
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re)
assert all(self.callback.values())

for type_, f in self.user_callbacks.items():
@@ -332,7 +336,13 @@ class TraditionalLexer(Lexer):
else:
self.callback[type_] = f

self.mres = build_mres(terminals, g_regex_flags, self.re)
self._mres = build_mres(terminals, self.g_regex_flags, self.re)

@property
def mres(self):
if self._mres is None:
self._build()
return self._mres

def match(self, stream, pos):
for mre, type_from_index in self.mres:
@@ -348,13 +358,15 @@ class TraditionalLexer(Lexer):

class ContextualLexer(Lexer):

def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
self.re = re_
def __init__(self, conf, states, always_accept=()):
terminals = list(conf.tokens)
tokens_by_name = {}
for t in terminals:
assert t.name not in tokens_by_name, t
tokens_by_name[t.name] = t

trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation)

lexer_by_tokens = {}
self.lexers = {}
for state, accepts in states.items():
@@ -362,14 +374,17 @@ class ContextualLexer(Lexer):
try:
lexer = lexer_by_tokens[key]
except KeyError:
accepts = set(accepts) | set(ignore) | set(always_accept)
accepts = set(accepts) | set(conf.ignore) | set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
lexer_conf = copy(trad_conf)
lexer_conf.tokens = state_tokens
lexer = TraditionalLexer(lexer_conf)
lexer_by_tokens[key] = lexer

self.lexers[state] = lexer

self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
assert trad_conf.tokens is terminals
self.root_lexer = TraditionalLexer(trad_conf)

def lex(self, stream, get_parser_state):
parser_state = get_parser_state()

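After this refactor a lexer is built from a LexerConf alone, and its regexps are only compiled once lexing actually starts (via the mres property above). A standalone sketch, assuming the lark version in this commit and toy terminal definitions:

    import re
    from lark.common import LexerConf
    from lark.lexer import TraditionalLexer, TerminalDef, PatternRE

    terminals = [TerminalDef('NUMBER', PatternRE(r'\d+')),
                 TerminalDef('WS', PatternRE(r'\s+'))]
    conf = LexerConf(terminals, re, ignore=['WS'])

    lexer = TraditionalLexer(conf)         # cheap: no regexps compiled yet
    tokens = list(lexer.lex('1 2 3'))      # compiled here, on first use
    print([t.type for t in tokens])        # ['NUMBER', 'NUMBER', 'NUMBER']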

lark/load_grammar.py  (+5, -5)

@@ -755,19 +755,19 @@ def _find_used_symbols(tree):
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}

class GrammarLoader:
def __init__(self, re_):
self.re = re_
def __init__(self, re_module):
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]

rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
callback = ParseTreeBuilder(rules, ST).create_callback()
lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)

self.canonize_tree = CanonizeTree()
self.re_module = re_module

def load_grammar(self, grammar_text, grammar_name='<?>'):
"Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
@@ -863,7 +863,7 @@ class GrammarLoader:
# import grammars
for dotted_path, (base_paths, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = import_grammar(grammar_path, self.re, base_paths=base_paths)
g = import_grammar(grammar_path, self.re_module, base_paths=base_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)

term_defs += new_td


lark/parser_frontends.py  (+18, -26)

@@ -62,18 +62,18 @@ class WithLexer(_ParserFrontend):
__serialize_fields__ = 'parser', 'lexer_conf', 'start'
__serialize_namespace__ = LexerConf,

def __init__(self, lexer_conf, parser_conf, re_, options=None):
def __init__(self, lexer_conf, parser_conf, options=None):
self.lexer_conf = lexer_conf
self.start = parser_conf.start
self.postlex = lexer_conf.postlex
self.re = re_

@classmethod
def deserialize(cls, data, memo, callbacks, postlex, re_):
def deserialize(cls, data, memo, callbacks, postlex, re_module):
inst = super(WithLexer, cls).deserialize(data, memo)
inst.re = re_
inst.postlex = postlex
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
inst.lexer_conf.re_module = re_module
inst.lexer_conf.skip_validation=True
inst.init_lexer()
return inst

@@ -89,18 +89,17 @@ class WithLexer(_ParserFrontend):
return self._parse(token_stream, start)

def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
self.lexer = TraditionalLexer(self.lexer_conf)

class LALR_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, re_, options=None):
def __init__(self, lexer_conf, parser_conf, options=None):
debug = options.debug if options else False
self.re = re_
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
WithLexer.__init__(self, lexer_conf, parser_conf, options)

self.init_lexer()

def init_lexer(self):
def init_lexer(self, **kw):
raise NotImplementedError()

class LALR_TraditionalLexer(LALR_WithLexer):
@@ -111,12 +110,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
def init_lexer(self):
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
re_=self.re,
ignore=self.lexer_conf.ignore,
always_accept=always_accept,
user_callbacks=self.lexer_conf.callbacks,
g_regex_flags=self.lexer_conf.g_regex_flags)
self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)


def parse(self, text, start=None):
@@ -129,11 +123,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
###}

class LALR_CustomLexer(LALR_WithLexer):
def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
self.lexer = lexer_cls(lexer_conf, re_=re_)
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
self.lexer = lexer_cls(lexer_conf)
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
WithLexer.__init__(self, lexer_conf, parser_conf, options)


def tokenize_text(text):
@@ -146,8 +140,8 @@ def tokenize_text(text):
yield Token('CHAR', ch, line=line, column=i - col_start_pos)

class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, re_, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
self.init_traditional_lexer()

resolve_ambiguity = options.ambiguity == 'resolve'
@@ -159,9 +153,7 @@ class Earley(WithLexer):


class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
self.re = re_

def __init__(self, lexer_conf, parser_conf, options=None, **kw):
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.start = parser_conf.start

@@ -193,7 +185,7 @@ class XEarley(_ParserFrontend):
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)

self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)
self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)

def parse(self, text, start):
return self._parse(text, start)
@@ -206,8 +198,8 @@ class XEarley_CompleteLex(XEarley):

class CYK(WithLexer):

def __init__(self, lexer_conf, parser_conf, re_, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
self.init_traditional_lexer()

self._analysis = GrammarAnalyzer(parser_conf)

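The skip_validation flag set in deserialize() above is what makes reloading a saved parser cheap: the terminals were already validated when the parser was first built, so the checks (and the regexp compilation) are deferred until the lexer is actually used. A usage sketch with a toy grammar, assuming Lark.save/Lark.load behave as in this version:

    from io import BytesIO
    from lark import Lark

    grammar = r"""
    start: NUMBER+
    NUMBER: /\d+/
    %import common.WS
    %ignore WS
    """

    parser = Lark(grammar, parser='lalr')   # grammar compiled, terminals validated

    buf = BytesIO()
    parser.save(buf)                        # pickle the serialized parser
    buf.seek(0)

    parser2 = Lark.load(buf)                # deserialize: validation is skipped
    print(parser2.parse("1 2 3"))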

tests/test_parser.py  (+3, -3)

@@ -6,7 +6,7 @@ import unittest
import logging
import os
import sys
from copy import deepcopy
from copy import copy, deepcopy
try:
from cStringIO import StringIO as cStringIO
except ImportError:
@@ -553,8 +553,8 @@ class CustomLexer(Lexer):
Purpose of this custom lexer is to test the integration,
so it uses the TraditionalLexer as its implementation, without custom lexing behaviour.
"""
def __init__(self, lexer_conf, re_):
self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
def __init__(self, lexer_conf):
self.lexer = TraditionalLexer(copy(lexer_conf))
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)

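For context, this is roughly how the custom lexer above gets plugged in: with parser='lalr', Lark accepts a Lexer subclass and now hands it only the LexerConf. A sketch mirroring the test, with a toy grammar:

    from copy import copy
    from lark import Lark
    from lark.lexer import Lexer, TraditionalLexer

    class CustomLexer(Lexer):
        # Same shape as the test's lexer: it receives only the LexerConf now.
        def __init__(self, lexer_conf):
            self.lexer = TraditionalLexer(copy(lexer_conf))

        def lex(self, *args, **kwargs):
            return self.lexer.lex(*args, **kwargs)

    parser = Lark(r"""
    start: NUMBER+
    NUMBER: /\d+/
    %import common.WS
    %ignore WS
    """, parser='lalr', lexer=CustomLexer)

    print(parser.parse("1 2 3"))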

