From d4425887d6f5a6617fa4c52709b518c68bc31872 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 15 Feb 2017 10:33:22 +0200
Subject: [PATCH] Added conf.py example and indenter support in contextual
 lexing

---
 examples/conf.py         | 37 +++++++++++++++++++++++++++++++++++++
 examples/conf_nolex.py   |  6 +++++-
 lark/indenter.py         |  7 +++++--
 lark/lexer.py            | 10 +++++-----
 lark/parser_frontends.py | 10 +++++++---
 5 files changed, 59 insertions(+), 11 deletions(-)
 create mode 100644 examples/conf.py

diff --git a/examples/conf.py b/examples/conf.py
new file mode 100644
index 0000000..c872b09
--- /dev/null
+++ b/examples/conf.py
@@ -0,0 +1,37 @@
+#
+# This example demonstrates the power of the contextual lexer by parsing a config file.
+#
+# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily
+# choose one over the other, which would lead to a (confusing) parse error.
+# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows
+# which one of them to expect at each point during the parse.
+# The lexer then only matches the tokens that the parser expects.
+# The result is a correct parse, something that is impossible with a regular lexer.
+#
+# Another approach is to discard the lexer altogether and use the Earley algorithm.
+# It will handle more cases than the contextual lexer, but at the cost of performance.
+# See examples/conf_nolex.py for an example of that approach.
+#
+
+from lark import Lark
+
+parser = Lark(r"""
+        start: _NL? section+
+        section: "[" NAME "]" _NL item+
+        item: NAME "=" VALUE _NL
+        NAME: /[a-zA-Z_]\w*/
+        VALUE: /.*/
+
+        WS.ignore: /[\t \f]+/
+        COMMENT.ignore: /\#[^\n]*/
+        _NL: /(\r?\n)+/
+    """, parser="lalr_contextual_lexer")
+
+
+sample_conf = """
+[bla]
+a=Hello
+this="that",4
+"""
+
+print(parser.parse(sample_conf).pretty())
diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py
index 6ae7340..7879b26 100644
--- a/examples/conf_nolex.py
+++ b/examples/conf_nolex.py
@@ -1,5 +1,5 @@
 #
-# This example demonstrates lex-less parsing using the earley_nolex frontend
+# This example demonstrates scanless parsing using the earley_nolex frontend
 #
 # Using a lexer for configuration files is tricky, because values don't
 # have to be surrounded by delimiters.
@@ -7,6 +7,10 @@
 #
 # Future versions of lark will make it easier to write these kinds of grammars.
 #
+# Another approach is to use the contextual lexer. It is less powerful than the scanless approach,
+# but it can handle some ambiguity in lexing, and it's much faster since it uses LALR(1).
+# See examples/conf.py for an example of that approach.
+#
 
 from lark import Lark, Transformer
 
diff --git a/lark/indenter.py b/lark/indenter.py
index d6d27ed..24ac170 100644
--- a/lark/indenter.py
+++ b/lark/indenter.py
@@ -26,7 +26,6 @@ class Indenter:
 
         assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
 
-
     def process(self, stream):
         for token in stream:
             if token.type == self.NL_type:
@@ -37,7 +36,7 @@ class Indenter:
 
             if token.type in self.OPEN_PAREN_types:
                 self.paren_level += 1
-            if token.type in self.CLOSE_PAREN_types:
+            elif token.type in self.CLOSE_PAREN_types:
                 self.paren_level -= 1
                 assert self.paren_level >= 0
 
@@ -47,3 +46,7 @@ class Indenter:
 
         assert self.indent_level == [0], self.indent_level
 
+    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
+    @property
+    def always_accept(self):
+        return (self.NL_type,)
diff --git a/lark/lexer.py b/lark/lexer.py
index 75c0d18..301d555 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -173,7 +173,7 @@ class Lexer(object):
 
 
 class ContextualLexer:
-    def __init__(self, tokens, states, ignore=()):
+    def __init__(self, tokens, states, ignore=(), always_accept=()):
         tokens_by_name = {}
         for t in tokens:
             assert t.name not in tokens_by_name
@@ -186,10 +186,9 @@ class ContextualLexer:
             try:
                 lexer = lexer_by_tokens[key]
             except KeyError:
-                accepts = list(accepts) # For python3
-                accepts += ignore
-                # if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts:
-                #     accepts.append('_NEWLINE') # XXX hack for now
+                accepts = set(accepts) # For python3
+                accepts |= set(ignore)
+                accepts |= set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
                 lexer = Lexer(state_tokens, ignore=ignore)
                 lexer_by_tokens[key] = lexer
@@ -228,6 +227,7 @@
                 break
         else:
             if lex_pos < len(stream):
+                print("Allowed tokens:", lexer.tokens)
                 raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
             break
 
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 0b9719b..668815c 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -40,13 +40,17 @@ class LALR_ContextualLexer:
         self.analyzer.analyze()
 
         d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore)
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
+                                     always_accept=lexer_conf.postlex.always_accept
+                                                   if lexer_conf.postlex else ())
 
     def parse(self, text):
         parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
-        l = self.lexer.lex(text, parser)
-        return parser.parse(l, True)
+        tokens = self.lexer.lex(text, parser)
+        if self.lexer_conf.postlex:
+            tokens = self.lexer_conf.postlex.process(tokens)
+        return parser.parse(tokens, True)
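
A minimal sketch of a post-lexer that plugs into the always_accept hook introduced
above. This is illustrative only: the NewlineCounter name and the _NL terminal are
assumptions, not part of this commit; only the process() method and the always_accept
attribute are what LALR_ContextualLexer expects from a postlex object.

    # Hypothetical post-lexer using the always_accept hook (not part of the patch).
    # Assumes the grammar defines a _NL terminal, as in examples/conf.py.
    class NewlineCounter:
        always_accept = ('_NL',)  # ask the contextual lexer to match _NL in every state

        def __init__(self):
            self.lines = 0

        def process(self, stream):
            # Called by LALR_ContextualLexer.parse() on the lexed token stream.
            for token in stream:
                if token.type == '_NL':
                    self.lines += 1
                yield token

Because the contextual lexer normally restricts each parser state to the terminals
the LALR(1) table expects there, declaring _NL in always_accept guarantees that
process() sees every newline token regardless of state, which is exactly why the
Indenter returns (self.NL_type,) from its new always_accept property.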