From a5a20a423adcefa860b035287740ace9310f4abc Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 25 Feb 2017 18:35:31 +0200 Subject: [PATCH] Changed parser/lexer interface in lark. Bumped minor version --- examples/calc.py | 4 ++-- examples/conf.py | 2 +- examples/conf_nolex.py | 2 +- examples/indented_tree.py | 5 +++-- lark/__init__.py | 2 +- lark/lark.py | 25 +++++++++++++++++++------ lark/lexer.py | 4 +++- lark/parser_frontends.py | 33 ++++++++++++++++++++++++++------- lark/reconstruct.py | 10 +++++----- tests/__main__.py | 2 +- tests/test_parser.py | 16 +++++++++++----- 11 files changed, 73 insertions(+), 32 deletions(-) diff --git a/examples/calc.py b/examples/calc.py index e5df631..83835cf 100644 --- a/examples/calc.py +++ b/examples/calc.py @@ -22,13 +22,13 @@ calc_grammar = """ | product "*" atom -> mul | product "/" atom -> div - ?atom: DECIMAL -> number + ?atom: NUMBER -> number | "-" atom -> neg | NAME -> var | "(" sum ")" %import common.CNAME -> NAME - %import common.DECIMAL + %import common.NUMBER %import common.WS_INLINE %ignore WS_INLINE diff --git a/examples/conf.py b/examples/conf.py index 9179605..13b928b 100644 --- a/examples/conf.py +++ b/examples/conf.py @@ -26,7 +26,7 @@ parser = Lark(r""" %ignore /[\t \f]+/ %ignore /\#[^\n]*/ - """, parser="lalr_contextual_lexer") + """, parser="lalr", lexer="contextual") sample_conf = """ diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py index 7879b26..2b0f6d9 100644 --- a/examples/conf_nolex.py +++ b/examples/conf_nolex.py @@ -24,7 +24,7 @@ parser = Lark(r""" _CR : /\r/ _LF : /\n/ - """, parser="earley_nolex") + """, lexer=None) class RestoreTokens(Transformer): value = ''.join diff --git a/examples/indented_tree.py b/examples/indented_tree.py index dc42086..b633cdd 100644 --- a/examples/indented_tree.py +++ b/examples/indented_tree.py @@ -16,9 +16,10 @@ tree_grammar = r""" tree: NAME _NL [_INDENT tree+ _DEDENT] - NAME: /\w+/ + %import common.CNAME -> NAME + %import common.WS_INLINE + %ignore 
WS_INLINE - WS.ignore: /\s+/ _NL: /(\r?\n[\t ]*)+/ _INDENT: "" _DEDENT: "" diff --git a/lark/__init__.py b/lark/__init__.py index c7dd915..80a933c 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -3,4 +3,4 @@ from .common import ParseError, GrammarError from .lark import Lark from .utils import inline_args -__version__ = "0.1.2" +__version__ = "0.2.0" diff --git a/lark/lark.py b/lark/lark.py index 20fa6dd..67aeb96 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -11,7 +11,7 @@ from .common import GrammarError, LexerConf, ParserConf from .lexer import Lexer from .parse_tree_builder import ParseTreeBuilder -from .parser_frontends import ENGINE_DICT +from .parser_frontends import get_frontend class LarkOptions(object): """Specifies the options for Lark @@ -19,7 +19,13 @@ class LarkOptions(object): """ OPTIONS_DOC = """ parser - Which parser engine to use ("earley" or "lalr". Default: "earley") - Note: Both will use Lark's lexer. + Note: "lalr" requires a lexer + lexer - Whether or not to use a lexer stage + None: Don't use a lexer + "standard": Use a standard lexer + "contextual": Stronger lexer (only works with parser="lalr") + "auto" (default): Choose for me based on grammar and parser + transformer - Applies the transformer to every parse tree debug - Affects verbosity (default: False) only_lex - Don't build a parser. Useful for debugging (default: False) @@ -40,11 +46,12 @@ class LarkOptions(object): self.cache_grammar = o.pop('cache_grammar', False) self.postlex = o.pop('postlex', None) self.parser = o.pop('parser', 'earley') + self.lexer = o.pop('lexer', 'auto') self.transformer = o.pop('transformer', None) self.start = o.pop('start', 'start') self.profile = o.pop('profile', False) - assert self.parser in ENGINE_DICT + # assert self.parser in ENGINE_DICT if self.parser == 'earley' and self.transformer: raise ValueError('Cannot specify an auto-transformer when using the Earley algorithm.' 
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)') @@ -118,9 +125,15 @@ class Lark: self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex) - if not self.options.only_lex: + if self.options.lexer == 'auto': + if self.options.parser == 'lalr': + self.options.lexer = 'standard' + elif self.options.parser == 'earley': + self.options.lexer = 'standard' + + if self.options.parser: self.parser = self._build_parser() - else: + elif self.options.lexer: self.lexer = self._build_lexer() if self.profiler: self.profiler.enter_section('outside_lark') @@ -131,7 +144,7 @@ class Lark: return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore) def _build_parser(self): - self.parser_class = ENGINE_DICT[self.options.parser] + self.parser_class = get_frontend(self.options.parser, self.options.lexer) self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class) rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer) if self.profiler: diff --git a/lark/lexer.py b/lark/lexer.py index 799597b..c4d39c0 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -88,7 +88,9 @@ class Lexer(object): raise LexError("Cannot compile token: %s: %s" % (t.name, t.pattern)) token_names = {t.name for t in tokens} - assert all(t in token_names for t in ignore) + for t in ignore: + if t not in token_names: + raise LexError("Token '%s' was marked to ignore but it is not defined!" 
def get_frontend(parser, lexer):
    """Return the parser-frontend class for a parser/lexer combination.

    Args:
        parser: Name of the parsing algorithm: ``'lalr'`` or ``'earley'``.
        lexer: Lexer mode: ``None`` (no lexer stage), ``'standard'`` or
            ``'contextual'``.

    Returns:
        ``LALR`` or ``LALR_ContextualLexer`` when ``parser`` is ``'lalr'``;
        ``Earley_NoLex`` or ``Earley`` when ``parser`` is ``'earley'``.

    Raises:
        ValueError: If ``parser`` or ``lexer`` is not a recognized name, if
            ``'lalr'`` is requested without a lexer, or if ``'earley'`` is
            requested with the contextual lexer.
    """
    if parser == 'lalr':
        if lexer is None:
            raise ValueError('The LALR parser requires use of a lexer')
        if lexer == 'standard':
            return LALR
        if lexer == 'contextual':
            return LALR_ContextualLexer
        # Wrap in a 1-tuple so a tuple-valued argument is reported as-is
        # instead of breaking the %-formatting with a TypeError.
        raise ValueError('Unknown lexer: %s' % (lexer,))

    if parser == 'earley':
        if lexer is None:
            return Earley_NoLex
        if lexer == 'standard':
            return Earley
        if lexer == 'contextual':
            # The unsupported component here is the lexer, not the parser.
            raise ValueError('The Earley parser does not support the contextual lexer')
        raise ValueError('Unknown lexer: %s' % (lexer,))

    raise ValueError('Unknown parser: %s' % (parser,))
@@ def is_iter_empty(i): class Reconstructor: def __init__(self, parser): tokens = {t.name:t for t in parser.lexer_conf.tokens} - token_res = {t.name:re.compile(t.to_regexp()) for t in parser.lexer_conf.tokens} + token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in parser.lexer_conf.tokens} class MatchData: def __init__(self, data): @@ -50,8 +50,8 @@ class Reconstructor: for sym in self.expansion: if is_discarded_terminal(sym): t = tokens[sym] - assert isinstance(t, TokenDef__Str) - to_write.append(t.value) + assert isinstance(t.pattern, PatternStr) + to_write.append(t.pattern.value) else: x = next(args2) if isinstance(x, list): diff --git a/tests/__main__.py b/tests/__main__.py index 7a6f9b3..a378822 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -5,7 +5,7 @@ import logging from .test_trees import TestTrees # from .test_selectors import TestSelectors -from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers +from .test_parser import TestLalrStandard, TestEarleyStandard, TestLalrContextual, TestParsers # from .test_grammars import TestPythonG, TestConfigG logging.basicConfig(level=logging.INFO) diff --git a/tests/test_parser.py b/tests/test_parser.py index 3b9a7b9..1ab8cfe 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -42,9 +42,9 @@ class TestParsers(unittest.TestCase): class TestEarley(unittest.TestCase): pass -def _make_parser_test(PARSER): +def _make_parser_test(LEXER, PARSER): def _Lark(grammar, **kwargs): - return Lark(grammar, parser=PARSER, **kwargs) + return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs) class _TestParser(unittest.TestCase): def test_basic1(self): g = _Lark("""start: a+ b a* "b" a* @@ -397,12 +397,18 @@ def _make_parser_test(PARSER): g.parse("+2e-9") self.assertRaises(ParseError, g.parse, "+2e-9e") - _NAME = "Test" + PARSER.capitalize() + _NAME = "Test" + PARSER.capitalize() + (LEXER or 'None').capitalize() _TestParser.__name__ = _NAME globals()[_NAME] = _TestParser 
-for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']: - _make_parser_test(PARSER) +_TO_TEST = [ + ('standard', 'earley'), + ('standard', 'lalr'), + ('contextual', 'lalr'), +] + +for LEXER, PARSER in _TO_TEST: + _make_parser_test(LEXER, PARSER) if __name__ == '__main__':