diff --git a/lark-stubs/__init__.pyi b/lark-stubs/__init__.pyi
index 7545f2d..c010a93 100644
--- a/lark-stubs/__init__.pyi
+++ b/lark-stubs/__init__.pyi
@@ -5,5 +5,7 @@ from .visitors import *
 from .exceptions import *
 from .lexer import *
 from .lark import *
+from logging import Logger as _Logger
+logger: _Logger
 
 __version__: str = ...
diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index 8f373e8..79437ff 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -2,7 +2,7 @@
 
 from typing import (
     TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
-    Literal, Protocol, Tuple,
+    Literal, Protocol, Tuple, Iterable,
 )
 from .visitors import Transformer
 from .lexer import Token, Lexer, TerminalDef
@@ -14,6 +14,8 @@ class PostLex(Protocol):
 
     def process(self, stream: Iterator[Token]) -> Iterator[Token]:
         ...
+
+    always_accept: Iterable[str]
 
 
 class LarkOptions:
diff --git a/lark/__init__.py b/lark/__init__.py
index 33a68cb..0e0ba4d 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -7,4 +7,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
 from .lexer import Token
 from .lark import Lark
 
-__version__ = "0.10.0"
+__version__ = "0.10.1"
diff --git a/lark/lark.py b/lark/lark.py
index 3ceddcb..f93c37b 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -169,6 +169,10 @@ class LarkOptions(Serialize):
         return cls(data)
 
 
+_LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'use_bytes', 'debug', 'g_regex_flags',
+                         'regex', 'propagate_positions', 'keep_all_tokens', 'tree_class'}
+
+
 class Lark(Serialize):
     """Main interface for the library.
 
@@ -239,8 +243,11 @@
 
             if FS.exists(cache_fn):
                 logger.debug('Loading grammar from cache: %s', cache_fn)
+                # Remove options that aren't relevant for loading from cache
+                for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
+                    del options[name]
                 with FS.open(cache_fn, 'rb') as f:
-                    self._load(f, self.options.transformer, self.options.postlex)
+                    self._load(f, **options)
                 return
 
         if self.options.lexer == 'auto':
@@ -278,8 +285,13 @@
         # Parse the grammar file and compose the grammars (TODO)
         self.grammar = load_grammar(grammar, self.source_path, re_module, self.options.import_paths)
 
+        if self.options.postlex is not None:
+            terminals_to_keep = set(self.options.postlex.always_accept)
+        else:
+            terminals_to_keep = set()
+
         # Compile the EBNF grammar into BNF
-        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
+        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep)
 
         if self.options.edit_terminals:
             for t in self.terminals:
@@ -319,7 +331,8 @@
             with FS.open(cache_fn, 'wb') as f:
                 self.save(f)
 
-    __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
+    if __doc__:
+        __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
 
     __serialize_fields__ = 'parser', 'rules', 'options'
 
@@ -345,7 +358,7 @@
         Useful for caching and multiprocessing.
         """
         data, m = self.memo_serialize([TerminalDef, Rule])
-        pickle.dump({'data': data, 'memo': m}, f)
+        pickle.dump({'data': data, 'memo': m}, f, protocol=pickle.HIGHEST_PROTOCOL)
 
     @classmethod
     def load(cls, f):
@@ -356,7 +369,7 @@
         inst = cls.__new__(cls)
         return inst._load(f)
 
-    def _load(self, f, transformer=None, postlex=None):
+    def _load(self, f, **kwargs):
         if isinstance(f, dict):
             d = f
         else:
@@ -367,12 +380,11 @@
         assert memo
         memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
         options = dict(data['options'])
-        if transformer is not None:
-            options['transformer'] = transformer
-        if postlex is not None:
-            options['postlex'] = postlex
+        if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
+            raise ValueError("Some options are not allowed when loading a Parser: {}"
+                             .format(set(kwargs) - _LOAD_ALLOWED_OPTIONS))
+        options.update(kwargs)
         self.options = LarkOptions.deserialize(options, memo)
-        re_module = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source_path = ''
         self._prepare_callbacks()
@@ -380,18 +392,16 @@
             data['parser'],
             memo,
             self._callbacks,
-            self.options.postlex,
-            self.options.transformer,
-            re_module
+            self.options,  # Not all, but multiple attributes are used
         )
         self.terminals = self.parser.lexer_conf.tokens
         self._terminals_dict = {t.name: t for t in self.terminals}
         return self
 
     @classmethod
-    def _load_from_dict(cls, data, memo, transformer=None, postlex=None):
+    def _load_from_dict(cls, data, memo, **kwargs):
         inst = cls.__new__(cls)
-        return inst._load({'data': data, 'memo': memo}, transformer, postlex)
+        return inst._load({'data': data, 'memo': memo}, **kwargs)
 
     @classmethod
     def open(cls, grammar_filename, rel_to=None, **options):
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index dfd0f11..4c0a339 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -527,7 +527,7 @@ class Grammar:
         self.rule_defs = rule_defs
         self.ignore = ignore
 
-    def compile(self, start):
+    def compile(self, start, terminals_to_keep):
        # We change the trees in-place (to support huge grammars)
        # So deepcopy allows calling compile more than once.
         term_defs = deepcopy(list(self.term_defs))
@@ -642,7 +642,7 @@
         used_terms = {t.name for r in compiled_rules
                         for t in r.expansion
                         if isinstance(t, Terminal)}
-        terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore)
+        terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep)
         if unused:
             logger.debug("Unused terminals: %s", [t.name for t in unused])
 
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index eb2b615..926603c 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -6,6 +6,11 @@ from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
 from .common import LexerConf
+try:
+    import regex
+except ImportError:
+    regex = None
+import re
 
 ###{standalone
 
@@ -82,16 +87,18 @@ class WithLexer(_ParserFrontend):
         self.postlex = lexer_conf.postlex
 
     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module):
+    def deserialize(cls, data, memo, callbacks, options):
         inst = super(WithLexer, cls).deserialize(data, memo)
-        inst.postlex = postlex
-        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        inst.postlex = options.postlex
+        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)
         terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
-        inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals)
-        inst.lexer_conf.re_module = re_module
-        inst.lexer_conf.skip_validation=True
+        inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
+        inst.lexer_conf.re_module = regex if options.regex else re
+        inst.lexer_conf.use_bytes = options.use_bytes
+        inst.lexer_conf.g_regex_flags = options.g_regex_flags
+        inst.lexer_conf.skip_validation = True
         inst.init_lexer()
         return inst
 
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index 7a94b4d..8745f46 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -246,13 +246,14 @@
 
     def compute_lalr1_states(self):
         m = {}
+        reduce_reduce = []
         for state in self.lr0_states:
             actions = {}
             for la, next_state in state.transitions.items():
                 actions[la] = (Shift, next_state.closure)
             for la, rules in state.lookaheads.items():
                 if len(rules) > 1:
-                    raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ])))
+                    reduce_reduce.append((la, rules))
                 if la in actions:
                     if self.debug:
                         logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
@@ -261,6 +262,12 @@
                     actions[la] = (Reduce, list(rules)[0])
             m[state] = { k.name: v for k, v in actions.items() }
 
+        if reduce_reduce:
+            msgs = [ 'Reduce/Reduce collision in %s between the following rules: %s'
+                     % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ]))
+                    for la, rules in reduce_reduce]
+            raise GrammarError('\n\n'.join(msgs))
+
         states = { k.closure: v for k, v in m.items() }
 
         # compute end states
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index cf6a4bf..433f3ef 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -23,10 +23,10 @@ class LALR_Parser(object):
         self.parser = _Parser(analysis.parse_table, callbacks, debug)
 
     @classmethod
-    def deserialize(cls, data, memo, callbacks):
+    def deserialize(cls, data, memo, callbacks, debug=False):
         inst = cls.__new__(cls)
         inst._parse_table = IntParseTable.deserialize(data, memo)
-        inst.parser = _Parser(inst._parse_table, callbacks)
+        inst.parser = _Parser(inst._parse_table, callbacks, debug)
         return inst
 
     def serialize(self, memo):
diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py
index 7993be0..f2af015 100644
--- a/lark/tools/standalone.py
+++ b/lark/tools/standalone.py
@@ -145,8 +145,8 @@ def main(fobj, start, print=print):
 
     print('Shift = 0')
     print('Reduce = 1')
-    print("def Lark_StandAlone(transformer=None, postlex=None):")
-    print("  return Lark._load_from_dict(DATA, MEMO, transformer=transformer, postlex=postlex)")
+    print("def Lark_StandAlone(**kwargs):")
+    print("  return Lark._load_from_dict(DATA, MEMO, **kwargs)")
 
diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py
index b9306c4..8c1f17a 100644
--- a/lark/tree_matcher.py
+++ b/lark/tree_matcher.py
@@ -81,7 +81,8 @@
     def __init__(self, parser):
         # XXX TODO calling compile twice returns different results!
         assert parser.options.maybe_placeholders == False
-        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start)
+        # XXX TODO: we just ignore the potential existence of a postlexer
+        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())
 
         self.rules_for_root = defaultdict(list)
 
diff --git a/tests/test_cache.py b/tests/test_cache.py
index ca4d781..4a07d7a 100644
--- a/tests/test_cache.py
+++ b/tests/test_cache.py
@@ -86,6 +86,12 @@ class TestCache(TestCase):
             parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
             assert len(mock_fs.files) == 1
             assert parser.parse('a') == Tree('start', [])
+
+            # Test options persistence
+            mock_fs.files = {}
+            Lark(g, parser="lalr", debug=True, cache=True)
+            parser = Lark(g, parser="lalr", debug=True, cache=True)
+            assert parser.options.options['debug']
         finally:
             lark_module.FS = fs
 
@@ -93,6 +99,3 @@
 
 if __name__ == '__main__':
     main()
-
-
-
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 6aaee4d..2e8ddf7 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1782,6 +1782,29 @@ def _make_parser_test(LEXER, PARSER):
             %import bad_test.NUMBER
             """
             self.assertRaises(IOError, _Lark, grammar)
+
+        @unittest.skipIf(LEXER=='dynamic', "%declare/postlex doesn't work with dynamic")
+        def test_postlex_declare(self): # Note: this test does a lot. maybe split it up?
+            class TestPostLexer:
+                def process(self, stream):
+                    for t in stream:
+                        if t.type == 'A':
+                            t.type = 'B'
+                            yield t
+                        else:
+                            yield t
+
+                always_accept = ('A',)
+
+            parser = _Lark("""
+            start: B
+            A: "A"
+            %declare B
+            """, postlex=TestPostLexer())
+
+            test_file = "A"
+            tree = parser.parse(test_file)
+            self.assertEqual(tree.children, [Token('B', 'A')])
 
         def test_import_custom_sources(self):
             custom_loader = FromPackageLoader('tests', ('grammars', ))
diff --git a/tests/test_tools.py b/tests/test_tools.py
index f7e0c47..ce995d8 100644
--- a/tests/test_tools.py
+++ b/tests/test_tools.py
@@ -25,7 +25,7 @@ class TestStandalone(TestCase):
         standalone.main(StringIO(grammar), 'start', print=pr)
         code = code_buf.getvalue()
 
-        context = {}
+        context = {'__doc__': None}
         exec(code, context)
         return context
 
diff --git a/tests/test_trees.py b/tests/test_trees.py
index cca92f9..905ad5a 100644
--- a/tests/test_trees.py
+++ b/tests/test_trees.py
@@ -20,7 +20,7 @@ class TestTrees(TestCase):
 
     def test_pickle(self):
         s = copy.deepcopy(self.tree1)
-        data = pickle.dumps(s)
+        data = pickle.dumps(s, protocol=pickle.HIGHEST_PROTOCOL)
         assert pickle.loads(data) == s
 
     def test_repr_runnable(self):
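Usage note (illustrative sketch, not part of the diff): with Lark._load now accepting **kwargs, any option listed in _LOAD_ALLOWED_OPTIONS is carried over when a parser is re-created from the cache or from a standalone/serialized parser, while disallowed options raise ValueError. The one-rule grammar below is a made-up example; the check mirrors the new assertion added to tests/test_cache.py.

    from lark import Lark

    # Hypothetical minimal grammar; any LALR-compatible grammar behaves the same way.
    grammar = '''
    start: "a"
    '''

    # The first call builds the parser and writes the cache file; the second call
    # reloads it from the cache. Because 'debug' is in _LOAD_ALLOWED_OPTIONS, it is
    # now preserved across the cached reload instead of being silently dropped.
    Lark(grammar, parser='lalr', cache=True, debug=True)
    parser = Lark(grammar, parser='lalr', cache=True, debug=True)
    assert parser.options.options['debug']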