From a5a20a423adcefa860b035287740ace9310f4abc Mon Sep 17 00:00:00 2001
From: Erez Shinan <erezshin+git@gmail.com>
Date: Sat, 25 Feb 2017 18:35:31 +0200
Subject: [PATCH] Changed parser/lexer interface in lark. Bumped minor version

---
 examples/calc.py          |  4 ++--
 examples/conf.py          |  2 +-
 examples/conf_nolex.py    |  2 +-
 examples/indented_tree.py |  5 +++--
 lark/__init__.py          |  2 +-
 lark/lark.py              | 25 +++++++++++++++++++------
 lark/lexer.py             |  4 +++-
 lark/parser_frontends.py  | 33 ++++++++++++++++++++++++++-------
 lark/reconstruct.py       | 10 +++++-----
 tests/__main__.py         |  2 +-
 tests/test_parser.py      | 16 +++++++++++-----
 11 files changed, 73 insertions(+), 32 deletions(-)
diff --git a/examples/calc.py b/examples/calc.py
index e5df631..83835cf 100644
--- a/examples/calc.py
+++ b/examples/calc.py
@@ -22,13 +22,13 @@ calc_grammar = """
         | product "*" atom  -> mul
         | product "/" atom  -> div
 
-    ?atom: DECIMAL          -> number
+    ?atom: NUMBER           -> number
          | "-" atom         -> neg
          | NAME             -> var
          | "(" sum ")"
 
     %import common.CNAME -> NAME
-    %import common.DECIMAL
+    %import common.NUMBER
     %import common.WS_INLINE
 
     %ignore WS_INLINE
diff --git a/examples/conf.py b/examples/conf.py
index 9179605..13b928b 100644
--- a/examples/conf.py
+++ b/examples/conf.py
@@ -26,7 +26,7 @@ parser = Lark(r"""
 
         %ignore /[\t \f]+/
         %ignore /\#[^\n]*/
-    """, parser="lalr_contextual_lexer")
+    """, parser="lalr", lexer="contextual")
 
 
 sample_conf = """
diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py
index 7879b26..2b0f6d9 100644
--- a/examples/conf_nolex.py
+++ b/examples/conf_nolex.py
@@ -24,7 +24,7 @@ parser = Lark(r"""
 
         _CR : /\r/
         _LF : /\n/
-    """, parser="earley_nolex")
+    """, lexer=None)
 
 class RestoreTokens(Transformer):
     value = ''.join
diff --git a/examples/indented_tree.py b/examples/indented_tree.py
index dc42086..b633cdd 100644
--- a/examples/indented_tree.py
+++ b/examples/indented_tree.py
@@ -16,9 +16,10 @@ tree_grammar = r"""
 
     tree: NAME _NL [_INDENT tree+ _DEDENT]
 
-    NAME: /\w+/
+    %import common.CNAME -> NAME
+    %import common.WS_INLINE
+    %ignore WS_INLINE
 
-    WS.ignore: /\s+/
     _NL: /(\r?\n[\t ]*)+/
     _INDENT: "<INDENT>"
     _DEDENT: "<DEDENT>"
diff --git a/lark/__init__.py b/lark/__init__.py
index c7dd915..80a933c 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError
 from .lark import Lark
 from .utils import inline_args
 
-__version__ = "0.1.2"
+__version__ = "0.2.0"
diff --git a/lark/lark.py b/lark/lark.py
index 20fa6dd..67aeb96 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -11,7 +11,7 @@ from .common import GrammarError, LexerConf, ParserConf
 
 from .lexer import Lexer
 from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import ENGINE_DICT
+from .parser_frontends import get_frontend
 
 class LarkOptions(object):
     """Specifies the options for Lark
@@ -19,7 +19,13 @@ class LarkOptions(object):
     """
     OPTIONS_DOC = """
         parser - Which parser engine to use ("earley" or "lalr". Default: "earley")
-                 Note: Both will use Lark's lexer.
+                 Note: "lalr" requires a lexer
+        lexer - Whether or not to use a lexer stage
+            None: Don't use a lexer
+            "standard": Use a standard lexer
+            "contextual": Stronger lexer (only works with parser="lalr")
+            "auto" (default): Choose for me based on grammar and parser
+
         transformer - Applies the transformer to every parse tree
         debug - Affects verbosity (default: False)
         only_lex - Don't build a parser. Useful for debugging (default: False)
@@ -40,11 +46,12 @@ class LarkOptions(object):
         self.cache_grammar = o.pop('cache_grammar', False)
         self.postlex = o.pop('postlex', None)
         self.parser = o.pop('parser', 'earley')
+        self.lexer = o.pop('lexer', 'auto')
         self.transformer = o.pop('transformer', None)
         self.start = o.pop('start', 'start')
         self.profile = o.pop('profile', False)
 
-        assert self.parser in ENGINE_DICT
+        # assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer:
             raise ValueError('Cannot specify an auto-transformer when using the Earley algorithm.'
                              'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)')
@@ -118,9 +125,15 @@ class Lark:
 
         self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)
 
-        if not self.options.only_lex:
+        if self.options.lexer == 'auto':
+            if self.options.parser == 'lalr':
+                self.options.lexer = 'standard'
+            elif self.options.parser == 'earley':
+                self.options.lexer = 'standard'
+
+        if self.options.parser:
             self.parser = self._build_parser()
-        else:
+        elif self.options.lexer:
             self.lexer = self._build_lexer()
 
         if self.profiler: self.profiler.enter_section('outside_lark')
@@ -131,7 +144,7 @@ class Lark:
         return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore)
 
     def _build_parser(self):
-        self.parser_class = ENGINE_DICT[self.options.parser]
+        self.parser_class = get_frontend(self.options.parser, self.options.lexer)
         self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
         if self.profiler:
diff --git a/lark/lexer.py b/lark/lexer.py
index 799597b..c4d39c0 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -88,7 +88,9 @@ class Lexer(object):
                 raise LexError("Cannot compile token: %s: %s" % (t.name, t.pattern))
 
         token_names = {t.name for t in tokens}
-        assert all(t in token_names for t in ignore)
+        for t in ignore:
+            if t not in token_names:
+                raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
 
         # Init
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index cf9d14e..dfe35e8 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -131,7 +131,7 @@ class Earley_NoLex:
     def _prepare_expansion(self, expansion):
         for sym in expansion:
             if is_terminal(sym):
-                regexp = self.token_by_name[sym].to_regexp()
+                regexp = self.token_by_name[sym].pattern.to_regexp()
                 width = sre_parse.parse(regexp).getwidth()
                 if not width == (1,1):
                     raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
@@ -144,9 +144,28 @@ class Earley_NoLex:
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-ENGINE_DICT = {
-    'lalr': LALR,
-    'earley': Earley,
-    'earley_nolex': Earley_NoLex,
-    'lalr_contextual_lexer': LALR_ContextualLexer
-}
+
+def get_frontend(parser, lexer):
+    """Map parser/lexer option names to the frontend class to instantiate.
+
+    Raises ValueError on an unknown name or an unsupported combination."""
+    if parser == 'lalr':
+        if lexer is None:
+            raise ValueError('The LALR parser requires use of a lexer')
+        elif lexer == 'standard':
+            return LALR
+        elif lexer == 'contextual':
+            return LALR_ContextualLexer
+        else:
+            raise ValueError('Unknown lexer: %s' % lexer)
+    elif parser == 'earley':
+        if lexer is None:
+            return Earley_NoLex
+        elif lexer == 'standard':
+            return Earley
+        elif lexer == 'contextual':
+            raise ValueError('The Earley parser does not support the contextual lexer')
+        else:
+            raise ValueError('Unknown lexer: %s' % lexer)
+    else:
+        raise ValueError('Unknown parser: %s' % parser)
diff --git a/lark/reconstruct.py b/lark/reconstruct.py
index cab5aed..a9b45e1 100644
--- a/lark/reconstruct.py
+++ b/lark/reconstruct.py
@@ -2,8 +2,8 @@ import re
 from collections import defaultdict
 
 from .tree import Tree
-from .common import is_terminal, ParserConf
-from .lexer import Token, TokenDef__Str
+from .common import is_terminal, ParserConf, PatternStr
+from .lexer import Token
 from .parsers import earley
 from .lark import Lark
 
@@ -22,7 +22,7 @@ def is_iter_empty(i):
 class Reconstructor:
     def __init__(self, parser):
         tokens = {t.name:t for t in parser.lexer_conf.tokens}
-        token_res = {t.name:re.compile(t.to_regexp()) for t in parser.lexer_conf.tokens}
+        token_res = {t.name:re.compile(t.pattern.to_regexp()) for t in parser.lexer_conf.tokens}
 
         class MatchData:
             def __init__(self, data):
@@ -50,8 +50,8 @@ class Reconstructor:
                 for sym in self.expansion:
                     if is_discarded_terminal(sym):
                         t = tokens[sym]
-                        assert isinstance(t, TokenDef__Str)
-                        to_write.append(t.value)
+                        assert isinstance(t.pattern, PatternStr)
+                        to_write.append(t.pattern.value)
                     else:
                         x = next(args2)
                         if isinstance(x, list):
diff --git a/tests/__main__.py b/tests/__main__.py
index 7a6f9b3..a378822 100644
--- a/tests/__main__.py
+++ b/tests/__main__.py
@@ -5,7 +5,7 @@ import logging
 
 from .test_trees import TestTrees
 # from .test_selectors import TestSelectors
-from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers
+from .test_parser import TestLalrStandard, TestEarleyStandard, TestLalrContextual, TestParsers
 # from .test_grammars import TestPythonG, TestConfigG
 
 logging.basicConfig(level=logging.INFO)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 3b9a7b9..1ab8cfe 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -42,9 +42,9 @@ class TestParsers(unittest.TestCase):
 class TestEarley(unittest.TestCase):
     pass
 
-def _make_parser_test(PARSER):
+def _make_parser_test(LEXER, PARSER):
     def _Lark(grammar, **kwargs):
-        return Lark(grammar, parser=PARSER, **kwargs)
+        return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)
     class _TestParser(unittest.TestCase):
         def test_basic1(self):
             g = _Lark("""start: a+ b a* "b" a*
@@ -397,12 +397,18 @@ def _make_parser_test(PARSER):
             g.parse("+2e-9")
             self.assertRaises(ParseError, g.parse, "+2e-9e")
 
-    _NAME = "Test" + PARSER.capitalize()
+    _NAME = "Test" + PARSER.capitalize() + (LEXER or 'None').capitalize()
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser
 
-for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
-    _make_parser_test(PARSER)
+_TO_TEST = [  # (lexer, parser) pairs; one test class is generated per pair
+        ('standard', 'earley'),
+        ('standard', 'lalr'),
+        ('contextual', 'lalr'),
+]
+
+for LEXER, PARSER in _TO_TEST:
+    _make_parser_test(LEXER, PARSER)  # registers Test<Parser><Lexer> in globals()
 
 
 if __name__ == '__main__':