Browse Source

Added 'cache' option to Lark (Issue #479)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.8.6
Erez Sh 4 years ago
parent
commit
9cc57abd8a
9 changed files with 165 additions and 43 deletions
  1. +4
    -1
      docs/classes.md
  2. +1
    -1
      examples/standalone/json_parser.py
  3. +60
    -27
      lark/lark.py
  4. +1
    -2
      lark/tools/standalone.py
  5. +7
    -3
      lark/utils.py
  6. +1
    -1
      lark_stubs/lark.pyi
  7. +1
    -0
      tests/__main__.py
  8. +82
    -0
      tests/test_cache.py
  9. +8
    -8
      tests/test_parser.py

+ 4
- 1
docs/classes.md View File

@@ -63,7 +63,10 @@ Useful for caching and multiprocessing.

**keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False)

**cache_grammar** - Cache the Lark grammar (Default: False)
**cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.
- When `False`, does nothing (default)
- When `True`, caches to a temporary file in the local directory
- When given a string, caches to the path pointed by the string

#### Algorithm



+ 1
- 1
examples/standalone/json_parser.py View File

@@ -1809,7 +1809,7 @@ class LarkOptions(Serialize):
'debug': False,
'keep_all_tokens': False,
'tree_class': None,
'cache_grammar': False,
'cache': False,
'postlex': None,
'parser': 'earley',
'lexer': 'auto',


+ 60
- 27
lark/lark.py View File

@@ -1,11 +1,10 @@
from __future__ import absolute_import

import os
import sys, os, pickle, hashlib, logging
from io import open
import pickle


from .utils import STRING_TYPE, Serialize, SerializeMemoizer
from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS
from .load_grammar import load_grammar
from .tree import Tree
from .common import LexerConf, ParserConf
@@ -35,7 +34,12 @@ class LarkOptions(Serialize):
When `False`, `[]` behaves like the `?` operator,
and returns no value at all.
(default=`False`. Recommended to set to `True`)
cache_grammar - Cache the Lark grammar (Default: False)
cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
LALR only for now.
When `False`, does nothing (default)
When `True`, caches to a temporary file in the local directory
When given a string, caches to the path pointed by the string

g_regex_flags - Flags that are applied to all terminals
(both regex and strings)
keep_all_tokens - Prevent the tree builder from automagically
@@ -80,7 +84,7 @@ class LarkOptions(Serialize):
'debug': False,
'keep_all_tokens': False,
'tree_class': None,
'cache_grammar': False,
'cache': False,
'postlex': None,
'parser': 'earley',
'lexer': 'auto',
@@ -102,7 +106,7 @@ class LarkOptions(Serialize):
for name, default in self._defaults.items():
if name in o:
value = o.pop(name)
if isinstance(default, bool):
if isinstance(default, bool) and name != 'cache':
value = bool(value)
else:
value = default
@@ -147,6 +151,7 @@ class Lark(Serialize):
grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
options : a dictionary controlling various aspects of Lark.
"""

self.options = LarkOptions(options)

# Some, but not all file-like objects have a 'name' attribute
@@ -165,8 +170,24 @@ class Lark(Serialize):

assert isinstance(grammar, STRING_TYPE)

if self.options.cache_grammar:
raise NotImplementedError("Not available yet")
cache_fn = None
if self.options.cache:
if isinstance(self.options.cache, STRING_TYPE):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ValueError("cache must be bool or str")
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
s = grammar + options_str
md5 = hashlib.md5(s.encode()).hexdigest()
cache_fn = '.lark_cache_%s.tmp' % md5

if FS.exists(cache_fn):
logging.debug('Loading grammar from cache: %s', cache_fn)
with FS.open(cache_fn, 'rb') as f:
self._load(f, self.options.transformer, self.options.postlex)
return

if self.options.lexer == 'auto':
if self.options.parser == 'lalr':
@@ -241,6 +262,11 @@ class Lark(Serialize):
elif lexer:
self.lexer = self._build_lexer()

if cache_fn:
logging.debug('Saving grammar to cache: %s', cache_fn)
with FS.open(cache_fn, 'wb') as f:
self.save(f)

if __init__.__doc__:
__init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC

@@ -259,34 +285,41 @@ class Lark(Serialize):
parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)

def save(self, f):
data, m = self.memo_serialize([TerminalDef, Rule])
pickle.dump({'data': data, 'memo': m}, f)

@classmethod
def deserialize(cls, data, namespace, memo, transformer=None, postlex=None):
if memo:
memo = SerializeMemoizer.deserialize(memo, namespace, {})
def load(cls, f):
inst = cls.__new__(cls)
return inst._load(f)

def _load(self, f, transformer=None, postlex=None):
if isinstance(f, dict):
d = f
else:
d = pickle.load(f)
memo = d['memo']
data = d['data']

assert memo
memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
options = dict(data['options'])
if transformer is not None:
options['transformer'] = transformer
if postlex is not None:
options['postlex'] = postlex
inst.options = LarkOptions.deserialize(options, memo)
inst.rules = [Rule.deserialize(r, memo) for r in data['rules']]
inst.source = '<deserialized>'
inst._prepare_callbacks()
inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex)
return inst

def save(self, f):
data, m = self.memo_serialize([TerminalDef, Rule])
pickle.dump({'data': data, 'memo': m}, f)
self.options = LarkOptions.deserialize(options, memo)
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source = '<deserialized>'
self._prepare_callbacks()
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
return self

@classmethod
def load(cls, f):
d = pickle.load(f)
namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}
memo = d['memo']
return Lark.deserialize(d['data'], namespace, memo)

@classmethod
def _load_from_dict(cls, data, memo, transformer=None, postlex=None):
    """Reconstruct a Lark instance from already-deserialized dicts.

    Entry point for the standalone tool's generated module, which embeds
    DATA and MEMO as module-level dicts rather than a pickled stream
    (see ``Lark._load_from_dict(DATA, MEMO, ...)`` in standalone.py).

    The ``@classmethod`` decorator is required: the method is invoked on
    the class itself, and ``cls.__new__(cls)`` needs ``cls`` bound to the
    Lark class, not to the first caller argument.
    """
    inst = cls.__new__(cls)
    # _load accepts either a file object or a pre-built {'data', 'memo'} dict.
    return inst._load({'data': data, 'memo': memo}, transformer, postlex)

@classmethod
def open(cls, grammar_filename, rel_to=None, **options):


+ 1
- 2
lark/tools/standalone.py View File

@@ -106,8 +106,7 @@ def main(fobj, start):
print('Shift = 0')
print('Reduce = 1')
print("def Lark_StandAlone(transformer=None, postlex=None):")
print(" namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}")
print(" return Lark.deserialize(DATA, namespace, MEMO, transformer=transformer, postlex=postlex)")
print(" return Lark._load_from_dict(DATA, MEMO, transformer=transformer, postlex=postlex)")





+ 7
- 3
lark/utils.py View File

@@ -1,4 +1,5 @@
import sys
import os
from functools import reduce
from ast import literal_eval
from collections import deque
@@ -37,9 +38,6 @@ def bfs(initial, expand):


def _serialize(value, memo):
# if memo and memo.in_types(value):
# return {'__memo__': memo.memoized.get(value)}

if isinstance(value, Serialize):
return value.serialize(memo)
elif isinstance(value, list):
@@ -287,3 +285,9 @@ def combine_alternatives(lists):
assert all(l for l in lists), lists
init = [[x] for x in lists[0]]
return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init)



class FS:
    # Minimal filesystem indirection layer for the grammar-cache feature.
    # Tests monkey-patch these two callables (e.g. lark.lark.FS = MockFS())
    # to redirect cache reads/writes into an in-memory store.
    open = open          # builtin open
    exists = os.path.exists

+ 1
- 1
lark_stubs/lark.pyi View File

@@ -33,7 +33,7 @@ class LarkOptions:
propagate_positions: bool
maybe_placeholders: bool
lexer_callbacks: Dict[str, Callable[[Token], Token]]
cache_grammar: bool
cache: Union[bool, str]
g_regex_flags: int




+ 1
- 0
tests/__main__.py View File

@@ -5,6 +5,7 @@ import logging

from .test_trees import TestTrees
from .test_tools import TestStandalone
from .test_cache import TestCache
from .test_reconstructor import TestReconstructor

try:


+ 82
- 0
tests/test_cache.py View File

@@ -0,0 +1,82 @@
from __future__ import absolute_import

import sys
from unittest import TestCase, main

from lark import Lark, Tree
import lark.lark as lark_module

try:
from StringIO import StringIO
except ImportError:
from io import BytesIO as StringIO

import tempfile, os


class MockFile(StringIO):
    """In-memory file stand-in whose close() is a no-op, so the buffer
    stays readable after the code under test 'closes' it. Also usable as
    a context manager."""

    def close(self):
        # Deliberately keep the underlying buffer open.
        return None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Do not suppress exceptions; do not close the buffer.
        return None

class MockFS:
    """In-memory replacement for lark's FS shim: maps filename -> MockFile."""

    def __init__(self):
        self.files = {}

    def open(self, name, mode=None):
        # Reuse an existing buffer, creating one on first access (the
        # `mode` argument is accepted for signature compatibility only).
        try:
            handle = self.files[name]
        except KeyError:
            handle = self.files[name] = MockFile()
        handle.seek(0)
        return handle

    def exists(self, name):
        return name in self.files


class TestCache(TestCase):
    """Exercise Lark's `cache` option against an in-memory filesystem."""

    def setUp(self):
        pass

    def test_simple(self):
        grammar = '''start: "a"'''
        cache_path = "bla"

        real_fs, fake_fs = lark_module.FS, MockFS()
        lark_module.FS = fake_fs
        try:
            # Explicit cache filename: first build writes the cache entry...
            Lark(grammar, parser='lalr', cache=cache_path)
            assert cache_path in fake_fs.files
            # ...and a second build loads from it and still parses.
            parser = Lark(grammar, parser='lalr', cache=cache_path)
            assert parser.parse('a') == Tree('start', [])

            # cache=True: the filename is derived from a hash of the
            # grammar and options, so one entry appears per grammar.
            fake_fs.files = {}
            assert len(fake_fs.files) == 0
            Lark(grammar, parser='lalr', cache=True)
            assert len(fake_fs.files) == 1
            parser = Lark(grammar, parser='lalr', cache=True)
            assert parser.parse('a') == Tree('start', [])

            # A modified grammar must hash to a distinct cache file.
            parser = Lark(grammar + ' "b"', parser='lalr', cache=True)
            assert len(fake_fs.files) == 2
            assert parser.parse('ab') == Tree('start', [])

            # The original grammar's cache entry remains valid.
            parser = Lark(grammar, parser='lalr', cache=True)
            assert parser.parse('a') == Tree('start', [])
        finally:
            # Always restore the real FS shim for other tests.
            lark_module.FS = real_fs



# Allow running this test module directly: `python tests/test_cache.py`.
if __name__ == '__main__':
    main()




+ 8
- 8
tests/test_parser.py View File

@@ -14,6 +14,7 @@ except ImportError:
cStringIO = None
from io import (
StringIO as uStringIO,
BytesIO,
open,
)

@@ -26,6 +27,8 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args
from lark.grammar import Rule
from lark.lexer import TerminalDef, Lexer, TraditionalLexer



__path__ = os.path.dirname(__file__)
def _read(n, *args):
with open(os.path.join(__path__, n), *args) as f:
@@ -873,7 +876,7 @@ def _make_parser_test(LEXER, PARSER):
self.assertSequenceEqual(x.children, [Tree('expr', [])])
x = g.parse("BC")
self.assertSequenceEqual(x.children, [Tree('b', [])])
def test_templates_modifiers(self):
g = _Lark(r"""
start: expr{"B"}
@@ -1736,15 +1739,12 @@ def _make_parser_test(LEXER, PARSER):
b: "B"
"""
parser = _Lark(grammar)
d = parser.serialize()
parser2 = Lark.deserialize(d, {}, {})
s = BytesIO()
parser.save(s)
s.seek(0)
parser2 = Lark.load(s)
self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]) )

namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}
d, m = parser.memo_serialize(namespace.values())
parser3 = Lark.deserialize(d, namespace, m)
self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) )

def test_multi_start(self):
parser = _Lark('''
a: "x" "a"?


Loading…
Cancel
Save