From da15f99edb23a916cc2c1eca87a87c653c4654f7 Mon Sep 17 00:00:00 2001
From: Erez Shinan <erezshin+git@gmail.com>
Date: Tue, 14 Feb 2017 13:12:00 +0200
Subject: [PATCH] Added the earley_nolex frontend, and a conf_nolex example to
 use it

---
 examples/calc.py          |  4 ++++
 examples/conf_nolex.py    | 42 +++++++++++++++++++++++++++++++++++++++
 examples/indented_tree.py | 14 ++++++++-----
 examples/json_parser.py   |  6 ++++++
 lark/parser_frontends.py  | 13 ++++++++----
 lark/parsers/earley.py    |  6 ++++--
 6 files changed, 74 insertions(+), 11 deletions(-)
 create mode 100644 examples/conf_nolex.py

diff --git a/examples/calc.py b/examples/calc.py
index 02574a8..dc936cf 100644
--- a/examples/calc.py
+++ b/examples/calc.py
@@ -1,3 +1,7 @@
+#
+# This example shows how to write a basic calculator with variables.
+#
+
 from lark import Lark, InlineTransformer
 
 calc_grammar = """
diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py
new file mode 100644
index 0000000..6ae7340
--- /dev/null
+++ b/examples/conf_nolex.py
@@ -0,0 +1,42 @@
+#
+# This example demonstrates lex-less parsing using the earley_nolex frontend
+#
+# Using a lexer for configuration files is tricky, because values don't
+# have to be surrounded by delimiters.
+# In this example we skip lexing and let the Earley parser resolve the ambiguity.
+#
+# Future versions of lark will make it easier to write these kinds of grammars.
+#
+
+from lark import Lark, Transformer
+
+parser = Lark(r"""
+        start: _nl? section+
+        section: "[" name "]" _nl item+
+        item: name "=" value _nl
+        name: /[a-zA-Z_]/ /\w/*
+        value: /./+
+        _nl: (_CR? _LF)+
+
+        _CR : /\r/
+        _LF : /\n/
+    """, parser="earley_nolex")
+
+class RestoreTokens(Transformer):
+    value = ''.join
+    name = ''.join
+
+
+def test():
+    sample_conf = """
+[bla]
+
+a=Hello
+this="that",4
+"""
+
+    r = parser.parse(sample_conf)
+    print(RestoreTokens().transform(r).pretty())
+
+if __name__ == '__main__':
+    test()
diff --git a/examples/indented_tree.py b/examples/indented_tree.py
index 1a0a202..dc42086 100644
--- a/examples/indented_tree.py
+++ b/examples/indented_tree.py
@@ -1,8 +1,12 @@
-"""This example demonstrates usage of the Indenter class.
-
-Since indentation is context-sensitive, a postlex stage is introduced to manufacture INDENT/DEDENT tokens.
-It is crucial for the indenter that the NL_type matches the spaces (and tabs) after the newline.
-"""
+#
+# This example demonstrates usage of the Indenter class.
+#
+# Since indentation is context-sensitive, a postlex stage is introduced to
+# manufacture INDENT/DEDENT tokens.
+#
+# It is crucial for the indenter that the NL_type matches
+# the spaces (and tabs) after the newline.
+#
 
 from lark.lark import Lark
 from lark.indenter import Indenter
diff --git a/examples/json_parser.py b/examples/json_parser.py
index 2d520db..b29e7ab 100644
--- a/examples/json_parser.py
+++ b/examples/json_parser.py
@@ -1,3 +1,9 @@
+#
+# This example shows how to write a basic JSON parser
+#
+# The code is short and clear, but has good performance.
+#
+
 import sys
 
 from lark import Lark, inline_args, Transformer
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index fad3ed5..af1275d 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,9 +1,10 @@
 import re
+import sre_parse
 
 from .lexer import Lexer
 from .parsers.lalr_analysis import GrammarAnalyzer
 
-from .common import is_terminal
+from .common import is_terminal, GrammarError
 from .parsers import lalr_parser, earley
 
 class WithLexer:
@@ -54,7 +55,7 @@ class Earley(WithLexer):
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-class Earley2:
+class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
@@ -68,7 +69,11 @@ class Earley2:
     def _prepare_expansion(self, expansion):
         for sym in expansion:
             if is_terminal(sym):
-                yield sym, re.compile(self.token_by_name[sym].to_regexp())
+                regexp = self.token_by_name[sym].to_regexp()
+                width = sre_parse.parse(regexp).getwidth()
+                if not width == (1,1):
+                    raise GrammarError('Dynamic lexing requires all tokens have the width 1 (%s is %s)' % (regexp, width))
+                yield sym, re.compile(regexp)
             else:
                 yield sym
 
@@ -77,4 +82,4 @@ class Earley2:
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }
+ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex }
diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index b8fb2ab..b2a511e 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -43,9 +43,11 @@ class State(object):
             # PORT: originally tests regexp
 
             if self.expect_symbol[1] is not None:
-                match = self.expect_symbol[1].match(stream, pos)
+                match = self.expect_symbol[1].match(inp)
+                if match:
+                    return self.next_state(inp)
 
-            if self.expect_symbol[0] == inp.type:
+            elif self.expect_symbol[0] == inp.type:
                 return self.next_state(inp)
 
     def consume_nonterminal(self, inp):