@@ -5,5 +5,7 @@ from .visitors import *
 from .exceptions import *
 from .lexer import *
 from .lark import *
+from logging import Logger as _Logger
 
+logger: _Logger
 __version__: str = ...
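The stub now also declares the module-level `lark.logger` that the runtime package exposes. A minimal sketch of how a user might switch on lark's internal logging (standard `logging` only; nothing here is lark-specific beyond the import):

    import logging
    from lark import logger

    # Lift lark's logger to DEBUG to see messages such as the
    # "Unused terminals: ..." output referenced later in this diff.
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(level=logging.DEBUG)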
@@ -2,7 +2,7 @@
 
 from typing import (
     TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
-    Literal, Protocol, Tuple,
+    Literal, Protocol, Tuple, Iterable,
 )
 from .visitors import Transformer
 from .lexer import Token, Lexer, TerminalDef
@@ -14,6 +14,8 @@ class PostLex(Protocol):
 
     def process(self, stream: Iterator[Token]) -> Iterator[Token]:
         ...
 
+    always_accept: Iterable[str]
+
 
 class LarkOptions:
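The protocol now requires `always_accept`: the terminal names a postlexer consumes even when no grammar rule references them. A minimal conforming postlexer (hypothetical, not part of this diff) might look like:

    from typing import Iterable, Iterator
    from lark.lexer import Token

    class SpaceEater:
        # Terminals the lexer must keep alive for the postlexer,
        # even if the grammar itself never uses them.
        always_accept: Iterable[str] = ('SPACE',)

        def process(self, stream: Iterator[Token]) -> Iterator[Token]:
            # Swallow whitespace tokens; pass everything else through.
            return (t for t in stream if t.type != 'SPACE')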
@@ -7,4 +7,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
 from .lexer import Token
 from .lark import Lark
 
-__version__ = "0.10.0"
+__version__ = "0.10.1"
@@ -169,6 +169,10 @@ class LarkOptions(Serialize):
         return cls(data)
 
 
+_LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'use_bytes', 'debug', 'g_regex_flags',
+                         'regex', 'propagate_positions', 'keep_all_tokens', 'tree_class'}
+
+
 class Lark(Serialize):
     """Main interface for the library.
@@ -239,8 +243,11 @@ class Lark(Serialize):
             if FS.exists(cache_fn):
                 logger.debug('Loading grammar from cache: %s', cache_fn)
+                # Remove options that aren't relevant for loading from cache
+                for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
+                    del options[name]
                 with FS.open(cache_fn, 'rb') as f:
-                    self._load(f, self.options.transformer, self.options.postlex)
+                    self._load(f, **options)
                 return
 
         if self.options.lexer == 'auto':
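On a cache hit, options named in `_LOAD_ALLOWED_OPTIONS` are now reapplied to the loaded parser instead of only `transformer` and `postlex`. A sketch of the behavior (mirroring the new test further down; the grammar is a stand-in):

    from lark import Lark

    g = 'start: "a"'

    # First construction compiles the grammar and writes the cache;
    # the second loads it, and debug=True survives the round-trip.
    Lark(g, parser='lalr', debug=True, cache=True)
    parser = Lark(g, parser='lalr', debug=True, cache=True)
    assert parser.options.options['debug']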
@@ -278,8 +285,13 @@ class Lark(Serialize):
         # Parse the grammar file and compose the grammars (TODO)
         self.grammar = load_grammar(grammar, self.source_path, re_module, self.options.import_paths)
 
+        if self.options.postlex is not None:
+            terminals_to_keep = set(self.options.postlex.always_accept)
+        else:
+            terminals_to_keep = set()
+
         # Compile the EBNF grammar into BNF
-        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
+        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep)
 
         if self.options.edit_terminals:
             for t in self.terminals:
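Keeping the postlexer's `always_accept` terminals through compilation is what allows a grammar to `%declare` a terminal that only the postlexer produces; the new `test_postlex_declare` near the end of this diff exercises exactly that path.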
@@ -319,7 +331,8 @@ class Lark(Serialize):
                 with FS.open(cache_fn, 'wb') as f:
                     self.save(f)
 
-    __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
+    if __doc__:
+        __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
 
     __serialize_fields__ = 'parser', 'rules', 'options'
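The guard matters because `__doc__` is None (or unbound) wherever docstrings are stripped, e.g. under `python -OO` or in the docstring-stripped standalone output, and the unguarded concatenation then fails. A minimal illustration of the failure mode being avoided:

    # What __doc__ becomes when docstrings are stripped:
    doc = None
    # Old behavior: doc += "\n\n" + "..." would raise
    #   TypeError: unsupported operand type(s) for +=: 'NoneType' and 'str'
    if doc:  # the new guard simply skips the concatenation
        doc += "\n\n" + "options documentation"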
@@ -345,7 +358,7 @@ class Lark(Serialize):
         Useful for caching and multiprocessing.
         """
         data, m = self.memo_serialize([TerminalDef, Rule])
-        pickle.dump({'data': data, 'memo': m}, f)
+        pickle.dump({'data': data, 'memo': m}, f, protocol=pickle.HIGHEST_PROTOCOL)
 
     @classmethod
     def load(cls, f):
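A round-trip sketch of the `save`/`load` API these hunks touch (the file name is hypothetical):

    from lark import Lark

    parser = Lark('start: "a"', parser='lalr')

    with open('parser.bin', 'wb') as f:
        parser.save(f)            # now pickled with HIGHEST_PROTOCOL

    with open('parser.bin', 'rb') as f:
        parser2 = Lark.load(f)

    assert parser2.parse('a') == parser.parse('a')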
@@ -356,7 +369,7 @@ class Lark(Serialize):
         inst = cls.__new__(cls)
         return inst._load(f)
 
-    def _load(self, f, transformer=None, postlex=None):
+    def _load(self, f, **kwargs):
         if isinstance(f, dict):
             d = f
         else:
@@ -367,12 +380,11 @@ class Lark(Serialize):
         assert memo
         memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
         options = dict(data['options'])
-        if transformer is not None:
-            options['transformer'] = transformer
-        if postlex is not None:
-            options['postlex'] = postlex
+        if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
+            raise ValueError("Some options are not allowed when loading a Parser: {}"
+                             .format(set(kwargs) - _LOAD_ALLOWED_OPTIONS))
+        options.update(kwargs)
         self.options = LarkOptions.deserialize(options, memo)
-        re_module = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source_path = '<deserialized>'
         self._prepare_callbacks()
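The new guard rejects a kwarg that is a genuine Lark option but not load-safe, while keys already filtered out by the cache path never reach it. The set arithmetic, demonstrated standalone (`_defaults` abbreviated here):

    _LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'use_bytes', 'debug', 'g_regex_flags',
                             'regex', 'propagate_positions', 'keep_all_tokens', 'tree_class'}
    _defaults = {'start', 'parser', 'lexer', 'debug', 'transformer'}  # abbreviated

    kwargs = {'debug': True, 'start': 'other'}
    bad = (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(_defaults)
    assert bad == {'start'}   # 'debug' passes; 'start' triggers the ValueError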
@@ -380,18 +392,16 @@ class Lark(Serialize):
             data['parser'],
             memo,
             self._callbacks,
-            self.options.postlex,
-            self.options.transformer,
-            re_module
+            self.options,  # Not all, but multiple attributes are used
         )
         self.terminals = self.parser.lexer_conf.tokens
         self._terminals_dict = {t.name: t for t in self.terminals}
         return self
 
     @classmethod
-    def _load_from_dict(cls, data, memo, transformer=None, postlex=None):
+    def _load_from_dict(cls, data, memo, **kwargs):
         inst = cls.__new__(cls)
-        return inst._load({'data': data, 'memo': memo}, transformer, postlex)
+        return inst._load({'data': data, 'memo': memo}, **kwargs)
 
     @classmethod
     def open(cls, grammar_filename, rel_to=None, **options):
@@ -527,7 +527,7 @@ class Grammar:
         self.rule_defs = rule_defs
         self.ignore = ignore
 
-    def compile(self, start):
+    def compile(self, start, terminals_to_keep):
         # We change the trees in-place (to support huge grammars)
         # So deepcopy allows calling compile more than once.
         term_defs = deepcopy(list(self.term_defs))
@@ -642,7 +642,7 @@ class Grammar:
         used_terms = {t.name for r in compiled_rules
                       for t in r.expansion
                       if isinstance(t, Terminal)}
-        terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore)
+        terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep)
 
         if unused:
             logger.debug("Unused terminals: %s", [t.name for t in unused])
@@ -6,6 +6,11 @@ from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
 from .common import LexerConf
+try:
+    import regex
+except ImportError:
+    regex = None
+import re
 
 ###{standalone
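`regex` is the optional third-party replacement for stdlib `re`; guarding the import keeps it optional while letting the deserialization path below pick it when the parser was built with `regex=True`. Usage sketch (assumes the `regex` package is installed):

    from lark import Lark

    # regex=True makes lark compile terminal patterns with the 'regex'
    # module, enabling e.g. Unicode property escapes like \p{L}.
    parser = Lark(r"""
    start: NAME
    NAME: /\p{L}+/
    """, regex=True)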
@@ -82,16 +87,18 @@ class WithLexer(_ParserFrontend):
         self.postlex = lexer_conf.postlex
 
     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module):
+    def deserialize(cls, data, memo, callbacks, options):
         inst = super(WithLexer, cls).deserialize(data, memo)
-        inst.postlex = postlex
-        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        inst.postlex = options.postlex
+        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)
         terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
-        inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals)
-        inst.lexer_conf.re_module = re_module
-        inst.lexer_conf.skip_validation=True
+        inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
+        inst.lexer_conf.re_module = regex if options.regex else re
+        inst.lexer_conf.use_bytes = options.use_bytes
+        inst.lexer_conf.g_regex_flags = options.g_regex_flags
+        inst.lexer_conf.skip_validation = True
         inst.init_lexer()
         return inst
@@ -246,13 +246,14 @@ class LALR_Analyzer(GrammarAnalyzer):
 
     def compute_lalr1_states(self):
         m = {}
+        reduce_reduce = []
         for state in self.lr0_states:
             actions = {}
             for la, next_state in state.transitions.items():
                 actions[la] = (Shift, next_state.closure)
             for la, rules in state.lookaheads.items():
                 if len(rules) > 1:
-                    raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ])))
+                    reduce_reduce.append((la, rules))
                 if la in actions:
                     if self.debug:
                         logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
@@ -261,6 +262,12 @@ class LALR_Analyzer(GrammarAnalyzer):
                 actions[la] = (Reduce, list(rules)[0])
             m[state] = { k.name: v for k, v in actions.items() }
 
+        if reduce_reduce:
+            msgs = [ 'Reduce/Reduce collision in %s between the following rules: %s'
+                     % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ]))
+                     for la, rules in reduce_reduce]
+            raise GrammarError('\n\n'.join(msgs))
+
         states = { k.closure: v for k, v in m.items() }
 
         # compute end states
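Deferring the raise means every reduce/reduce collision in the grammar is reported in a single GrammarError, not just the first one hit. A hypothetical grammar that triggers such a collision under LALR(1):

    from lark import Lark
    from lark.exceptions import GrammarError

    conflicting = """
    start: a | b
    a: "x"
    b: "x"
    """

    try:
        Lark(conflicting, parser='lalr')
    except GrammarError as e:
        print(e)   # lists each colliding lookahead with its candidate rules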
@@ -23,10 +23,10 @@ class LALR_Parser(object):
         self.parser = _Parser(analysis.parse_table, callbacks, debug)
 
     @classmethod
-    def deserialize(cls, data, memo, callbacks):
+    def deserialize(cls, data, memo, callbacks, debug=False):
         inst = cls.__new__(cls)
         inst._parse_table = IntParseTable.deserialize(data, memo)
-        inst.parser = _Parser(inst._parse_table, callbacks)
+        inst.parser = _Parser(inst._parse_table, callbacks, debug)
         return inst
 
     def serialize(self, memo):
@@ -145,8 +145,8 @@ def main(fobj, start, print=print):
     print('Shift = 0')
     print('Reduce = 1')
-    print("def Lark_StandAlone(transformer=None, postlex=None):")
-    print("  return Lark._load_from_dict(DATA, MEMO, transformer=transformer, postlex=postlex)")
+    print("def Lark_StandAlone(**kwargs):")
+    print("  return Lark._load_from_dict(DATA, MEMO, **kwargs)")
@@ -81,7 +81,8 @@ class TreeMatcher:
     def __init__(self, parser):
         # XXX TODO calling compile twice returns different results!
         assert parser.options.maybe_placeholders == False
-        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start)
+        # XXX TODO: we just ignore the potential existence of a postlexer
+        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
 
         self.rules_for_root = defaultdict(list)
@@ -86,6 +86,12 @@ class TestCache(TestCase):
             parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
             assert len(mock_fs.files) == 1
             assert parser.parse('a') == Tree('start', [])
+
+            # Test options persistence
+            mock_fs.files = {}
+            Lark(g, parser="lalr", debug=True, cache=True)
+            parser = Lark(g, parser="lalr", debug=True, cache=True)
+            assert parser.options.options['debug']
         finally:
             lark_module.FS = fs
@@ -93,6 +99,3 @@ class TestCache(TestCase):
 
 if __name__ == '__main__':
     main()
@@ -1782,6 +1782,29 @@ def _make_parser_test(LEXER, PARSER):
                 %import bad_test.NUMBER
                 """
             self.assertRaises(IOError, _Lark, grammar)
 
+        @unittest.skipIf(LEXER=='dynamic', "%declare/postlex doesn't work with dynamic")
+        def test_postlex_declare(self):  # Note: this test does a lot. maybe split it up?
+            class TestPostLexer:
+                def process(self, stream):
+                    for t in stream:
+                        if t.type == 'A':
+                            t.type = 'B'
+                            yield t
+                        else:
+                            yield t
+
+                always_accept = ('A',)
+
+            parser = _Lark("""
+            start: B
+            A: "A"
+            %declare B
+            """, postlex=TestPostLexer())
+
+            test_file = "A"
+            tree = parser.parse(test_file)
+            self.assertEqual(tree.children, [Token('B', 'A')])
+
         def test_import_custom_sources(self):
             custom_loader = FromPackageLoader('tests', ('grammars', ))
@@ -25,7 +25,7 @@ class TestStandalone(TestCase):
         standalone.main(StringIO(grammar), 'start', print=pr)
         code = code_buf.getvalue()
-        context = {}
+        context = {'__doc__': None}
         exec(code, context)
         return context
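Seeding the exec namespace with `'__doc__': None` is needed because an exec() globals dict, unlike a real module, does not define `__doc__`; the generated code's new `if __doc__:` guard would otherwise raise NameError there. None mirrors what a module compiled without docstrings sees.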
@@ -20,7 +20,7 @@ class TestTrees(TestCase):
     def test_pickle(self):
         s = copy.deepcopy(self.tree1)
-        data = pickle.dumps(s)
+        data = pickle.dumps(s, protocol=pickle.HIGHEST_PROTOCOL)
         assert pickle.loads(data) == s
 
     def test_repr_runnable(self):