From 0ee80e675a74720a65bd5f637328a73d48e38503 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 6 Jan 2018 18:49:24 +0200 Subject: [PATCH 01/21] Refactoring for LALR, added the ParseTable class --- lark/parser_frontends.py | 2 +- lark/parsers/grammar_analysis.py | 2 +- lark/parsers/lalr_analysis.py | 61 +++++++++++++++++++++++--------- lark/parsers/lalr_parser.py | 29 +++++++-------- 4 files changed, 61 insertions(+), 33 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 718a0f9..ad5017b 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -38,7 +38,7 @@ class LALR_ContextualLexer: self.parser = lalr_parser.Parser(parser_conf) - d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()} + d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 9250c47..391e3dd 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -125,7 +125,7 @@ class GrammarAnalyzer(object): if not (is_terminal(sym) or sym in self.rules_by_origin): raise GrammarError("Using an undefined rule: %s" % sym) - self.init_state = self.expand_rule('$root') + self.start_state = self.expand_rule('$root') self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index e763b08..3f2d30f 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -14,7 +14,41 @@ from ..common import GrammarError, is_terminal from .grammar_analysis import GrammarAnalyzer -ACTION_SHIFT = 0 +class Action: + def __str__(self): + return self.__name__ + def __repr__(self): + return str(self) + +class Shift(Action): pass +class Reduce(Action): pass + +class ParseTable: + def __init__(self, states, start_state, end_state): + self.states = states + self.start_state = start_state + self.end_state = end_state + +class IntParseTable(ParseTable): + + @classmethod + def from_ParseTable(cls, parse_table): + enum = list(parse_table.states) + state_to_idx = {s:i for i,s in enumerate(enum)} + int_states = {} + + for s, la in parse_table.states.items(): + la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v + for k,v in la.items()} + int_states[ state_to_idx[s] ] = la + + + start_state = state_to_idx[parse_table.start_state] + end_state = state_to_idx[parse_table.end_state] + return cls(int_states, start_state, end_state) + + + class LALR_Analyzer(GrammarAnalyzer): @@ -27,7 +61,7 @@ class LALR_Analyzer(GrammarAnalyzer): sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) for rp in sat: for term in self.FOLLOW.get(rp.rule.origin, ()): - lookahead[term].append(('reduce', rp.rule)) + lookahead[term].append((Reduce, rp.rule)) d = classify(unsat, lambda rp: rp.next) for sym, rps in d.items(): @@ -38,7 +72,7 @@ class LALR_Analyzer(GrammarAnalyzer): rps |= self.expand_rule(rp.next) new_state = fzset(rps) - lookahead[sym].append(('shift', new_state)) + lookahead[sym].append((Shift, new_state)) if sym == '$end': self.end_states.append( new_state ) yield fzset(rps) @@ -50,7 +84,7 @@ class LALR_Analyzer(GrammarAnalyzer): for x in v: # XXX resolving shift/reduce into shift, like PLY # Give a proper warning - if x[0] == 'shift': + if x[0] is Shift: lookahead[k] = [x] 
for k, v in lookahead.items(): @@ -59,22 +93,15 @@ class LALR_Analyzer(GrammarAnalyzer): self.states[state] = {k:v[0] for k, v in lookahead.items()} - for _ in bfs([self.init_state], step): + for _ in bfs([self.start_state], step): pass self.end_state ,= self.end_states - # -- - self.enum = list(self.states) - self.enum_rev = {s:i for i,s in enumerate(self.enum)} - self.states_idx = {} - - for s, la in self.states.items(): - la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' - else (v[0], (v[1], len(v[1].expansion))) # Reduce - for k,v in la.items()} - self.states_idx[ self.enum_rev[s] ] = la + self._parse_table = ParseTable(self.states, self.start_state, self.end_state) + if self.debug: + self.parse_table = self._parse_table + else: + self.parse_table = IntParseTable.from_ParseTable(self._parse_table) - self.init_state_idx = self.enum_rev[self.init_state] - self.end_state_idx = self.enum_rev[self.end_state] diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f224bec..c913661 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -5,7 +5,7 @@ from ..common import ParseError, UnexpectedToken -from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT +from .lalr_analysis import LALR_Analyzer, Shift class FinalReduce: def __init__(self, value): @@ -19,14 +19,14 @@ class Parser: callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) for rule in analysis.rules} - self.parser = _Parser(analysis.states_idx, analysis.init_state_idx, analysis.end_state_idx, callbacks) + self.parser = _Parser(analysis.parse_table, callbacks) self.parse = self.parser.parse class _Parser: - def __init__(self, states, init_state, end_state, callbacks): - self.states = states - self.init_state = init_state - self.end_state = end_state + def __init__(self, parse_table, callbacks): + self.states = parse_table.states + self.start_state = parse_table.start_state + self.end_state = parse_table.end_state self.callbacks = callbacks def parse(self, seq, set_state=None): @@ -35,10 +35,10 @@ class _Parser: stream = iter(seq) states = self.states - state_stack = [self.init_state] + state_stack = [self.start_state] value_stack = [] - if set_state: set_state(self.init_state) + if set_state: set_state(self.start_state) def get_action(key): state = state_stack[-1] @@ -49,7 +49,8 @@ class _Parser: raise UnexpectedToken(token, expected, seq, i) - def reduce(rule, size): + def reduce(rule): + size = len(rule.expansion) if size: s = value_stack[-size:] del state_stack[-size:] @@ -60,7 +61,7 @@ class _Parser: value = self.callbacks[rule](s) _action, new_state = get_action(rule.origin) - assert _action == ACTION_SHIFT + assert _action is Shift state_stack.append(new_state) value_stack.append(value) @@ -72,22 +73,22 @@ class _Parser: action, arg = get_action(token.type) assert arg != self.end_state - if action == ACTION_SHIFT: + if action is Shift: state_stack.append(arg) value_stack.append(token) if set_state: set_state(arg) token = next(stream) i += 1 else: - reduce(*arg) + reduce(arg) except StopIteration: pass while True: _action, arg = get_action('$end') - if _action == ACTION_SHIFT: + if _action is Shift: assert arg == self.end_state val ,= value_stack return val else: - reduce(*arg) + reduce(arg) From 1cc4c965e87e89b686c0f8c7b01349bdcf3e8ddb Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Jan 2018 00:50:40 +0200 Subject: [PATCH 02/21] Big Refactor: Grammars now build in half the time. Code shorter & cleaner. 
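For orientation, here is a minimal sketch of how the refactored pieces fit
together at this point in the series: rule definitions become plain Rule
objects (lark/grammar.py, added below) instead of (origin, expansion, alias,
options) tuples, and the LALR analyzer from PATCH 01 turns them into a
ParseTable. The toy grammar below is illustrative only — it is not part of
the library or its tests — and PATCH 03 later changes the analyzers to take
a ParserConf rather than a bare rule list.

    from lark.grammar import Rule
    from lark.parsers.lalr_analysis import LALR_Analyzer, Shift

    # Toy grammar:  a : A a | A    ('A' is a terminal because it is uppercase)
    rules = [
        Rule('a', ['A', 'a']),
        Rule('a', ['A']),
    ]

    analyzer = LALR_Analyzer(rules, 'a')    # signature as of this patch
    analyzer.compute_lookahead()

    table = analyzer.parse_table            # IntParseTable(states, start_state, end_state)
    action, _next_state = table.states[table.start_state]['A']
    assert action is Shift                  # the start state shifts on 'A'
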
--- lark/common.py | 27 +------------ lark/grammar.py | 16 ++++++++ lark/lark.py | 1 + lark/lexer.py | 2 +- lark/parse_tree_builder.py | 5 ++- lark/parser_frontends.py | 67 ++++++++++++++++---------------- lark/parsers/earley.py | 24 ++++++------ lark/parsers/grammar_analysis.py | 43 ++++++++------------ lark/parsers/lalr_analysis.py | 2 +- lark/parsers/lalr_parser.py | 5 ++- lark/parsers/xearley.py | 30 +++++++------- 11 files changed, 104 insertions(+), 118 deletions(-) create mode 100644 lark/grammar.py diff --git a/lark/common.py b/lark/common.py index 55e9d28..800aa4f 100644 --- a/lark/common.py +++ b/lark/common.py @@ -33,7 +33,7 @@ class UnexpectedToken(ParseError): def is_terminal(sym): - return isinstance(sym, Terminal) or sym.isupper() or sym == '$end' + return sym.isupper() class LexerConf: @@ -44,7 +44,6 @@ class LexerConf: class ParserConf: def __init__(self, rules, callback, start): - assert all(len(r) == 4 for r in rules) self.rules = rules self.callback = callback self.start = start @@ -108,27 +107,3 @@ class TokenDef(object): def __repr__(self): return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) - -class Terminal: - def __init__(self, data): - self.data = data - - def __repr__(self): - return '%r' % self.data - - def __eq__(self, other): - return isinstance(other, type(self)) and self.data == other.data - def __hash__(self): - return hash(self.data) - - -class Terminal_Regexp(Terminal): - def __init__(self, name, regexp): - Terminal.__init__(self, regexp) - self.name = name - self.match = re.compile(regexp).match - -class Terminal_Token(Terminal): - def match(self, other): - return self.data == other.type - diff --git a/lark/grammar.py b/lark/grammar.py new file mode 100644 index 0000000..281c21c --- /dev/null +++ b/lark/grammar.py @@ -0,0 +1,16 @@ + +class Rule(object): + """ + origin : a symbol + expansion : a list of symbols + """ + def __init__(self, origin, expansion, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.options = options + + def __repr__(self): + return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + + diff --git a/lark/lark.py b/lark/lark.py index d8ee186..03bd253 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -171,6 +171,7 @@ class Lark: for f in dir(callback): if not (f.startswith('__') and f.endswith('__')): setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) + parser_conf = ParserConf(rules, callback, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) diff --git a/lark/lexer.py b/lark/lexer.py index 2741af0..66923b0 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -204,7 +204,7 @@ class ContextualLexer: lexer = lexer_by_tokens[key] except KeyError: accepts = set(accepts) | set(ignore) | set(always_accept) - state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end'] + state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] lexer = Lexer(state_tokens, ignore=ignore) lexer_by_tokens[key] = lexer diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 975121d..497af55 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,6 +1,7 @@ from .common import is_terminal, GrammarError from .utils import suppress from .lexer import Token +from .grammar import Rule class NodeBuilder: def __init__(self, tree_class, name): @@ -27,7 +28,7 @@ class Factory: def __call__(self, node_builder): return 
self.cls(node_builder, *self.args) - + class TokenWrapper: "Used for fixing the results of scanless parsing" @@ -151,6 +152,6 @@ class ParseTreeBuilder: raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) setattr(callback, callback_name, f) - new_rules.append(( origin, expansion, callback_name, options )) + new_rules.append( Rule( origin, expansion, callback_name, options )) return new_rules, callback diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index ad5017b..228640f 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -3,7 +3,7 @@ import sre_parse from .lexer import Lexer, ContextualLexer, Token -from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token +from .common import is_terminal, GrammarError, ParserConf from .parsers import lalr_parser, earley, xearley, resolve_ambig class WithLexer: @@ -70,25 +70,26 @@ def tokenize_text(text): class Earley_NoLex: def __init__(self, lexer_conf, parser_conf, options=None): - self.token_by_name = {t.name:t for t in lexer_conf.tokens} - - rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] + self._prepare_match(lexer_conf) - self.parser = earley.Parser(rules, + self.parser = earley.Parser(parser_conf.rules, parser_conf.start, parser_conf.callback, + self.match, resolve_ambiguity=get_ambiguity_resolver(options)) - def _prepare_expansion(self, expansion): - for sym in expansion: - if is_terminal(sym): - regexp = self.token_by_name[sym].pattern.to_regexp() - width = sre_parse.parse(regexp).getwidth() - if width != (1,1): - raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) - yield Terminal_Regexp(sym, regexp) - else: - yield sym + + def match(self, term, text, index=0): + return self.regexps[term].match(text, index) + + def _prepare_match(self, lexer_conf): + self.regexps = {} + for t in lexer_conf.tokens: + regexp = t.pattern.to_regexp() + width = sre_parse.parse(regexp).getwidth() + if width != (1,1): + raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) + self.regexps[t.name] = re.compile(regexp) def parse(self, text): new_text = tokenize_text(text) @@ -98,15 +99,14 @@ class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): WithLexer.__init__(self, lexer_conf) - rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] - - self.parser = earley.Parser(rules, + self.parser = earley.Parser(parser_conf.rules, parser_conf.start, parser_conf.callback, + self.match, resolve_ambiguity=get_ambiguity_resolver(options)) - def _prepare_expansion(self, expansion): - return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] + def match(self, term, token): + return term == token.type def parse(self, text): tokens = self.lex(text) @@ -117,27 +117,26 @@ class XEarley: def __init__(self, lexer_conf, parser_conf, options=None): self.token_by_name = {t.name:t for t in lexer_conf.tokens} - rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] - - ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] + self._prepare_match(lexer_conf) - self.parser = xearley.Parser(rules, + self.parser = xearley.Parser(parser_conf.rules, parser_conf.start, parser_conf.callback, + self.match, 
resolve_ambiguity=get_ambiguity_resolver(options), - ignore=ignore, + ignore=lexer_conf.ignore, predict_all=options.earley__predict_all ) - def _prepare_expansion(self, expansion): - for sym in expansion: - if is_terminal(sym): - regexp = self.token_by_name[sym].pattern.to_regexp() - width = sre_parse.parse(regexp).getwidth() - assert width - yield Terminal_Regexp(sym, regexp) - else: - yield sym + def match(self, term, text, index=0): + return self.regexps[term].match(text, index) + + def _prepare_match(self, lexer_conf): + self.regexps = {} + for t in lexer_conf.tokens: + regexp = t.pattern.to_regexp() + assert sre_parse.parse(regexp).getwidth() + self.regexps[t.name] = re.compile(regexp) def parse(self, text): return self.parser.parse(text) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 55893f5..e6a914d 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -13,13 +13,13 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken, Terminal +from ..common import ParseError, UnexpectedToken, is_terminal from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse from .grammar_analysis import GrammarAnalyzer class EndToken: - type = '$end' + type = '$END' class Derivation(Tree): _hash = None @@ -135,7 +135,7 @@ class Column: self.completed[item_key] = item self.to_reduce.append(item) else: - if isinstance(item.expect, Terminal): + if is_terminal(item.expect): self.to_scan.append(item) else: k = item_key if self.predict_all else item @@ -152,7 +152,7 @@ class Column: __nonzero__ = __bool__ # Py2 backwards-compatibility class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): + def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None): self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity @@ -161,12 +161,13 @@ class Parser: self.predictions = {} self.FIRST = {} for rule in self.analysis.rules: - if rule.origin != '$root': # XXX kinda ugly - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) - self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + a = rule.alias + self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) + self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + + self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): @@ -174,9 +175,10 @@ class Parser: start_symbol = start_symbol or self.start_symbol _Item = Item + match = self.term_matcher def predict(nonterm, column): - assert not isinstance(nonterm, Terminal), nonterm + assert not is_terminal(nonterm), nonterm return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): @@ -203,7 +205,7 @@ class Parser: def scan(i, token, column): next_set = Column(i, self.FIRST) - next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token)) + next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) if not next_set: expect = {i.expect for i in column.to_scan} diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 391e3dd..7390c58 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ 
-1,20 +1,8 @@ from ..utils import bfs, fzset from ..common import GrammarError, is_terminal +from ..grammar import Rule -class Rule(object): - """ - origin : a symbol - expansion : a list of symbols - """ - def __init__(self, origin, expansion, alias=None, options=None): - self.origin = origin - self.expansion = expansion - self.alias = alias - self.options = options - - def __repr__(self): - return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) class RulePtr(object): def __init__(self, rule, index): @@ -106,28 +94,29 @@ def calculate_sets(rules): class GrammarAnalyzer(object): - def __init__(self, rule_tuples, start_symbol, debug=False): + def __init__(self, rules, start_symbol, debug=False): + assert len(rules) == len(set(rules)) + self.start_symbol = start_symbol self.debug = debug - rule_tuples = list(rule_tuples) - rule_tuples.append(('$root', [start_symbol, '$end'])) - rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples] - - self.rules = set() - self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples} - for origin, exp, alias, options in rule_tuples: - r = Rule( origin, exp, alias, options ) - self.rules.add(r) - self.rules_by_origin[origin].append(r) - - for r in self.rules: + + root_rule = Rule('$root', [start_symbol, '$END']) + + self.rules_by_origin = {r.origin: [] for r in rules} + for r in rules: + self.rules_by_origin[r.origin].append(r) + + self.rules_by_origin[root_rule.origin] = [root_rule] + + for r in rules: for sym in r.expansion: if not (is_terminal(sym) or sym in self.rules_by_origin): raise GrammarError("Using an undefined rule: %s" % sym) self.start_state = self.expand_rule('$root') + self.rules = rules - self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules) + self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule]) def expand_rule(self, rule): "Returns all init_ptrs accessible by rule (recursive)" diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 3f2d30f..2c9e8a4 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -73,7 +73,7 @@ class LALR_Analyzer(GrammarAnalyzer): new_state = fzset(rps) lookahead[sym].append((Shift, new_state)) - if sym == '$end': + if sym == '$END': self.end_states.append( new_state ) yield fzset(rps) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index c913661..237619d 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -13,7 +13,8 @@ class FinalReduce: class Parser: def __init__(self, parser_conf): - assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" + assert all(r.options is None or r.options.priority is None + for r in parser_conf.rules), "LALR doesn't yet support prioritization" self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) analysis.compute_lookahead() callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) @@ -85,7 +86,7 @@ class _Parser: pass while True: - _action, arg = get_action('$end') + _action, arg = get_action('$END') if _action is Shift: assert arg == self.end_state val ,= value_stack diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 9b26190..055b26e 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -20,7 +20,7 @@ from collections import defaultdict -from ..common import ParseError, UnexpectedToken, Terminal +from ..common import ParseError, UnexpectedToken, is_terminal 
from ..lexer import Token, UnexpectedInput from ..tree import Tree from .grammar_analysis import GrammarAnalyzer @@ -28,7 +28,7 @@ from .grammar_analysis import GrammarAnalyzer from .earley import ApplyCallbacks, Item, Column class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): + def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity @@ -41,24 +41,26 @@ class Parser: self.FIRST = {} for rule in self.analysis.rules: - if rule.origin != '$root': # XXX kinda ugly - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) - self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + a = rule.alias + self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) + self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + + self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): # Define parser functions start_symbol = start_symbol or self.start_symbol delayed_matches = defaultdict(list) + match = self.term_matcher text_line = 1 text_column = 0 def predict(nonterm, column): - assert not isinstance(nonterm, Terminal), nonterm + assert not is_terminal(nonterm), nonterm return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): @@ -86,7 +88,7 @@ class Parser: to_scan = column.to_scan for x in self.ignore: - m = x.match(stream, i) + m = match(x, stream, i) if m: delayed_matches[m.end()] += set(to_scan) delayed_matches[m.end()] += set(column.to_reduce) @@ -99,16 +101,16 @@ class Parser: # delayed_matches[m.end()] += to_scan for item in to_scan: - m = item.expect.match(stream, i) + m = match(item.expect, stream, i) if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) + t = Token(item.expect, m.group(0), i, text_line, text_column) delayed_matches[m.end()].append(item.advance(t)) s = m.group(0) for j in range(1, len(s)): - m = item.expect.match(s[:-j]) + m = match(item.expect, s[:-j]) if m: - t = Token(item.expect.name, m.group(0), i, text_line, text_column) + t = Token(item.expect, m.group(0), i, text_line, text_column) delayed_matches[i+m.end()].append(item.advance(t)) next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) @@ -143,7 +145,7 @@ class Parser: if n.rule.origin==start_symbol and n.start is column0] if not solutions: - expected_tokens = [t.expect.name for t in column.to_scan] + expected_tokens = [t.expect for t in column.to_scan] raise ParseError('Unexpected end of input! 
Expecting a terminal of: %s' % expected_tokens) elif len(solutions) == 1: From 39e58cb8fdb5bec30d8b44514fd75f0c70c86d10 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Jan 2018 11:15:30 +0200 Subject: [PATCH 03/21] Post-refactor cleanup --- lark/parser_frontends.py | 56 ++++++++++++-------------------- lark/parsers/earley.py | 53 ++++++------------------------ lark/parsers/grammar_analysis.py | 7 ++-- lark/parsers/lalr_parser.py | 2 +- lark/parsers/xearley.py | 24 +++++--------- 5 files changed, 45 insertions(+), 97 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 228640f..e8e7ab8 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -7,10 +7,16 @@ from .common import is_terminal, GrammarError, ParserConf from .parsers import lalr_parser, earley, xearley, resolve_ambig class WithLexer: - def __init__(self, lexer_conf): + def init_traditional_lexer(self, lexer_conf): self.lexer_conf = lexer_conf self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore) + def init_contextual_lexer(self, lexer_conf, parser_conf): + self.lexer_conf = lexer_conf + d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} + always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () + self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) + def lex(self, text): stream = self.lexer.lex(text) if self.lexer_conf.postlex: @@ -21,32 +27,22 @@ class WithLexer: class LALR(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf) - - self.parser_conf = parser_conf self.parser = lalr_parser.Parser(parser_conf) + self.init_traditional_lexer(lexer_conf) def parse(self, text): - tokens = self.lex(text) - return self.parser.parse(tokens) + token_stream = self.lex(text) + return self.parser.parse(token_stream) -class LALR_ContextualLexer: +class LALR_ContextualLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - self.lexer_conf = lexer_conf - self.parser_conf = parser_conf - self.parser = lalr_parser.Parser(parser_conf) - - d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} - always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () - self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) + self.init_contextual_lexer(lexer_conf, parser_conf) def parse(self, text): - tokens = self.lexer.lex(text) - if self.lexer_conf.postlex: - tokens = self.lexer_conf.postlex.process(tokens) - return self.parser.parse(tokens, self.lexer.set_parser_state) + token_stream = self.lex(text) + return self.parser.parse(token_stream, self.lexer.set_parser_state) def get_ambiguity_resolver(options): if not options or options.ambiguity == 'resolve': @@ -58,24 +54,19 @@ def get_ambiguity_resolver(options): raise ValueError(options) def tokenize_text(text): - new_text = [] line = 1 col_start_pos = 0 for i, ch in enumerate(text): if '\n' in ch: line += ch.count('\n') col_start_pos = i + ch.rindex('\n') - new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos)) - return new_text + yield Token('CHAR', ch, line=line, column=i - col_start_pos) class Earley_NoLex: def __init__(self, lexer_conf, parser_conf, options=None): self._prepare_match(lexer_conf) - self.parser = earley.Parser(parser_conf.rules, - parser_conf.start, - parser_conf.callback, - self.match, + self.parser = earley.Parser(parser_conf, 
self.match, resolve_ambiguity=get_ambiguity_resolver(options)) @@ -92,17 +83,14 @@ class Earley_NoLex: self.regexps[t.name] = re.compile(regexp) def parse(self, text): - new_text = tokenize_text(text) - return self.parser.parse(new_text) + token_stream = tokenize_text(text) + return self.parser.parse(token_stream) class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf) + self.init_traditional_lexer(lexer_conf) - self.parser = earley.Parser(parser_conf.rules, - parser_conf.start, - parser_conf.callback, - self.match, + self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=get_ambiguity_resolver(options)) def match(self, term, token): @@ -119,9 +107,7 @@ class XEarley: self._prepare_match(lexer_conf) - self.parser = xearley.Parser(parser_conf.rules, - parser_conf.start, - parser_conf.callback, + self.parser = xearley.Parser(parser_conf, self.match, resolve_ambiguity=get_ambiguity_resolver(options), ignore=lexer_conf.ignore, diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index e6a914d..62d3e15 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -18,9 +18,6 @@ from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse from .grammar_analysis import GrammarAnalyzer -class EndToken: - type = '$END' - class Derivation(Tree): _hash = None @@ -36,8 +33,6 @@ class Derivation(Tree): self._hash = Tree.__hash__(self) return self._hash -END_TOKEN = EndToken() - class Item(object): "An Earley Item, the atom of the algorithm." @@ -60,11 +55,8 @@ class Item(object): new_tree = Derivation(self.rule, self.tree.children + [tree]) return self.__class__(self.rule, self.ptr+1, self.start, new_tree) - def similar(self, other): - return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule - def __eq__(self, other): - return self.similar(other) #and (self.tree == other.tree) + return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule def __hash__(self): return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ @@ -152,27 +144,24 @@ class Column: __nonzero__ = __bool__ # Py2 backwards-compatibility class Parser: - def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None): - self.analysis = GrammarAnalyzer(rules, start_symbol) - self.start_symbol = start_symbol + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): + self.analysis = GrammarAnalyzer(parser_conf) + self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity + self.FIRST = self.analysis.FIRST self.postprocess = {} self.predictions = {} - self.FIRST = {} - for rule in self.analysis.rules: - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) + for rule in parser_conf.rules: + self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] - self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.start_symbol + start_symbol = start_symbol or self.parser_conf.start _Item = Item match = self.term_matcher @@ -198,9 +187,8 @@ class Parser: for item in to_reduce: new_items = list(complete(item)) - for new_item in new_items: - if new_item.similar(item): - raise ParseError('Infinite recursion detected! 
(rule %s)' % new_item.rule) + if item in new_items: + raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) column.add(new_items) def scan(i, token, column): @@ -252,24 +240,3 @@ class ApplyCallbacks(Transformer_NoRecurse): return callback(children) else: return Tree(rule.origin, children) - -# RULES = [ -# ('a', ['d']), -# ('d', ['b']), -# ('b', ['C']), -# ('b', ['b', 'C']), -# ('b', ['C', 'b']), -# ] -# p = Parser(RULES, 'a') -# for x in p.parse('CC'): -# print x.pretty() - -#--------------- -# RULES = [ -# ('s', ['a', 'a']), -# ('a', ['b', 'b']), -# ('b', ['C'], lambda (x,): x), -# ('b', ['b', 'C']), -# ] -# p = Parser(RULES, 's', {}) -# print p.parse('CCCCC').pretty() diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 7390c58..a8c7757 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -94,13 +94,14 @@ def calculate_sets(rules): class GrammarAnalyzer(object): - def __init__(self, rules, start_symbol, debug=False): + def __init__(self, parser_conf, debug=False): + rules = parser_conf.rules assert len(rules) == len(set(rules)) - self.start_symbol = start_symbol + self.start_symbol = parser_conf.start self.debug = debug - root_rule = Rule('$root', [start_symbol, '$END']) + root_rule = Rule('$root', [self.start_symbol, '$END']) self.rules_by_origin = {r.origin: [] for r in rules} for r in rules: diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 237619d..bc45d4e 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -15,7 +15,7 @@ class Parser: def __init__(self, parser_conf): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" - self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) + self.analysis = analysis = LALR_Analyzer(parser_conf) analysis.compute_lookahead() callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) for rule in analysis.rules} diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 055b26e..3cc67f3 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -28,31 +28,26 @@ from .grammar_analysis import GrammarAnalyzer from .earley import ApplyCallbacks, Item, Column class Parser: - def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): - self.analysis = GrammarAnalyzer(rules, start_symbol) - self.start_symbol = start_symbol + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): + self.analysis = GrammarAnalyzer(parser_conf) + self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity self.ignore = list(ignore) self.predict_all = predict_all - + self.FIRST = self.analysis.FIRST self.postprocess = {} self.predictions = {} - self.FIRST = {} - - for rule in self.analysis.rules: - a = rule.alias - self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) + for rule in parser_conf.rules: + self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] - self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] - self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): # Define parser functions - start_symbol = start_symbol or self.start_symbol + start_symbol = start_symbol or self.parser_conf.start delayed_matches = defaultdict(list) 
match = self.term_matcher @@ -79,9 +74,8 @@ class Parser: column.add( predict(nonterm, column) ) for item in to_reduce: new_items = list(complete(item)) - for new_item in new_items: - if new_item.similar(item): - raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) + if item in new_items: + raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) column.add(new_items) def scan(i, token, column): From 38c5fd244ab5c36a692a53ee9bf40881c60b5ac3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Jan 2018 17:20:07 +0200 Subject: [PATCH 04/21] Improved grammar validation and refactored the lexers --- lark/lexer.py | 114 +++++++++++++++++-------------------- lark/load_grammar.py | 6 +- lark/parse_tree_builder.py | 2 +- lark/parsers/xearley.py | 2 +- tests/test_parser.py | 41 +++++++++++-- 5 files changed, 95 insertions(+), 70 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 66923b0..ba920c6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False): return _build_mres(tokens, len(tokens), match_whole) -class Lexer(object): +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculat the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. + """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + + + +class Lexer: def __init__(self, tokens, ignore=()): assert all(isinstance(t, TokenDef) for t in tokens), tokens self.ignore = ignore - self.newline_char = '\n' tokens = list(tokens) # Sanitization @@ -129,10 +152,7 @@ class Lexer(object): if t.pattern.min_width == 0: raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) - token_names = {t.name for t in tokens} - for t in ignore: - if t not in token_names: - raise LexError("Token '%s' was marked to ignore but it is not defined!" 
% t) + assert set(ignore) <= {t.name for t in tokens} # Init self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] @@ -147,46 +167,8 @@ class Lexer(object): self.mres = build_mres(tokens) - def lex(self, stream): - lex_pos = 0 - line = 1 - col_start_pos = 0 - newline_types = list(self.newline_types) - ignore_types = list(self.ignore_types) - while True: - for mre, type_from_index in self.mres: - m = mre.match(stream, lex_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - to_yield = type_ not in ignore_types - - if to_yield: - t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) - end_col = t.column + len(value) - if t.type in self.callback: - t = self.callback[t.type](t) - - if type_ in newline_types: - newlines = value.count(self.newline_char) - if newlines: - line += newlines - last_newline_index = value.rindex(self.newline_char) + 1 - col_start_pos = lex_pos + last_newline_index - end_col = len(value) - last_newline_index - - if to_yield: - t.end_line = line - t.end_col = end_col - yield t - - lex_pos += len(value) - break - else: - if lex_pos < len(stream): - raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) - break + return _Lex(self).lex(stream, self.newline_types, self.ignore_types) class ContextualLexer: @@ -218,33 +200,39 @@ class ContextualLexer: self.parser_state = state def lex(self, stream): - lex_pos = 0 - line = 1 - col_start_pos = 0 - newline_types = list(self.root_lexer.newline_types) - ignore_types = list(self.root_lexer.ignore_types) + l = _Lex(self.lexers[self.parser_state]) + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + l.lexer = self.lexers[self.parser_state] + + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + newline_types = list(newline_types) + line_ctr = LineCounter() + while True: - lexer = self.lexers[self.parser_state] + lexer = self.lexer for mre, type_from_index in lexer.mres: - m = mre.match(stream, lex_pos) + m = mre.match(stream, line_ctr.char_pos) if m: value = m.group(0) type_ = type_from_index[m.lastindex] if type_ not in ignore_types: - t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) if t.type in lexer.callback: t = lexer.callback[t.type](t) - yield t - - if type_ in newline_types: - newlines = value.count(lexer.newline_char) - if newlines: - line += newlines - col_start_pos = lex_pos + value.rindex(lexer.newline_char) - lex_pos += len(value) + lexer = yield t + + line_ctr.feed(value, type_ in newline_types) break else: - if lex_pos < len(stream): - raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens) + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) break diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 72e2e22..7726845 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -411,6 +411,7 @@ class Grammar: terms_to_ignore = {name:'__'+name for name in self.ignore} if terms_to_ignore: assert set(terms_to_ignore) <= {name for name, _t in term_defs} + term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] expr = Token('RULE', '__ignore') for r, tree, _o in rule_defs: @@ -562,6 +563,7 @@ class GrammarLoader: d = {r: 
([(x.split(), None) for x in xs], o) for r, xs, o in rules} rules, callback = ParseTreeBuilder(d, T).apply() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) + parser_conf = ParserConf(rules, callback, 'start') self.parser = LALR(lexer_conf, parser_conf) @@ -636,7 +638,6 @@ class GrammarLoader: ignore_names.append(name) token_defs.append((name, (t, 0))) - # Verify correctness 2 token_names = set() for name, _ in token_defs: @@ -644,6 +645,9 @@ class GrammarLoader: raise GrammarError("Token '%s' defined more than once" % name) token_names.add(name) + if set(ignore_names) > token_names: + raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names)) + # Resolve token references resolve_token_references(token_defs) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 497af55..e26d287 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -121,7 +121,7 @@ class ParseTreeBuilder: for expansion, alias in expansions: if alias and origin.startswith('_'): - raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) wrapper_chain = filter(None, [ (expand1 and not alias) and Expand1, diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 3cc67f3..420c469 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -127,7 +127,7 @@ class Parser: if token == '\n': text_line += 1 - text_column = 1 + text_column = 0 else: text_column += 1 diff --git a/tests/test_parser.py b/tests/test_parser.py index d93e33b..db28834 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase): r = T().transform(g.parse("x")) self.assertEqual( r.children, [""] ) - + g = Lark("""start: a ?a : b b : "x" @@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase): r = T().transform(g.parse("xx")) self.assertEqual( r.children, [""] ) - + g = Lark("""start: a ?a : b b -> c b : "x" """, parser='lalr', transformer=T()) r = g.parse("xx") self.assertEqual( r.children, [""] ) - + @@ -796,6 +796,39 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, ['a', 'A']) + def test_undefined_ignore(self): + g = """!start: "A" + + %ignore B + """ + self.assertRaises( GrammarError, _Lark, g) + + @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO + def test_line_and_column(self): + g = r"""!start: "A" bc "D" + !bc: "B\nC" + """ + l = _Lark(g) + a, bc, d = l.parse("AB\nCD").children + self.assertEqual(a.line, 1) + self.assertEqual(a.column, 0) + + bc ,= bc.children + self.assertEqual(bc.line, 1) + self.assertEqual(bc.column, 1) + + self.assertEqual(d.line, 2) + self.assertEqual(d.column, 1) + + # self.assertEqual(a.end_line, 1) + # self.assertEqual(a.end_col, 1) + # self.assertEqual(bc.end_line, 2) + # self.assertEqual(bc.end_col, 1) + # self.assertEqual(d.end_line, 2) + # self.assertEqual(d.end_col, 2) + + + def test_reduce_cycle(self): """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. It seems that the correct solution is to explicitely distinguish finalization in the reduce() function. @@ -969,7 +1002,7 @@ def _make_parser_test(LEXER, PARSER): parser = _Lark(grammar) - tree = parser.parse("int 1 ! 
This is a comment\n") + tree = parser.parse("int 1 ! This is a comment\n") self.assertEqual(tree.children, ['1']) tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky! From 7182ba399136bf2c0f1f74d6652e60ffeb55d448 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 7 Jan 2018 22:33:37 +0200 Subject: [PATCH 05/21] Minor refactoring for the standalone tool (in progress) --- lark/grammar.py | 38 ++++++++++++++++++++++++- lark/lexer.py | 52 +++++++++++++++++------------------ lark/load_grammar.py | 28 +------------------ lark/parsers/lalr_analysis.py | 8 ++++-- lark/parsers/lalr_parser.py | 5 +--- 5 files changed, 70 insertions(+), 61 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 281c21c..f853182 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -10,7 +10,43 @@ class Rule(object): self.alias = alias self.options = options - def __repr__(self): + def __str__(self): return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + +class RuleOptions: + def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.create_token = create_token # used for scanless postprocessing + self.priority = priority + self.filter_out = filter_out # remove this rule from the tree + # used for "token"-rules in scanless + @classmethod + def from_rule(cls, name, *x): + if len(x) > 1: + priority, expansions = x + priority = int(priority) + else: + expansions ,= x + priority = None + + keep_all_tokens = name.startswith('!') + name = name.lstrip('!') + expand1 = name.startswith('?') + name = name.lstrip('?') + + return name, expansions, cls(keep_all_tokens, expand1, priority=priority) + + def __repr__(self): + return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.create_token, + self.priority, + self.filter_out + ) diff --git a/lark/lexer.py b/lark/lexer.py index ba920c6..5ca77de 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -111,35 +111,11 @@ def build_mres(tokens, match_whole=False): return _build_mres(tokens, len(tokens), match_whole) -class LineCounter: - def __init__(self): - self.newline_char = '\n' - self.char_pos = 0 - self.line = 1 - self.column = 0 - self.line_start_pos = 0 - - def feed(self, token, test_newline=True): - """Consume a token and calculat the new line & column. - - As an optional optimization, set test_newline=False is token doesn't contain a newline. 
- """ - if test_newline: - newlines = token.count(self.newline_char) - if newlines: - self.line += newlines - self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 - - self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos - - class Lexer: def __init__(self, tokens, ignore=()): assert all(isinstance(t, TokenDef) for t in tokens), tokens - self.ignore = ignore tokens = list(tokens) # Sanitization @@ -156,7 +132,7 @@ class Lexer: # Init self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] - self.ignore_types = [t for t in ignore] + self.ignore_types = list(ignore) tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) @@ -206,6 +182,30 @@ class ContextualLexer: l.lexer = self.lexers[self.parser_state] +###{lexer + +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. + """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + class _Lex: "Built to serve both Lexer and ContextualLexer" def __init__(self, lexer): @@ -235,4 +235,4 @@ class _Lex: if line_ctr.char_pos < len(stream): raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) break - +###} diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 7726845..ce4ec5a 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,6 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef +from .grammar import RuleOptions from .tree import Tree as T, Transformer, InlineTransformer, Visitor @@ -494,33 +495,6 @@ class Grammar: -class RuleOptions: - def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): - self.keep_all_tokens = keep_all_tokens - self.expand1 = expand1 - self.create_token = create_token # used for scanless postprocessing - self.priority = priority - - self.filter_out = filter_out # remove this rule from the tree - # used for "token"-rules in scanless - @classmethod - def from_rule(cls, name, *x): - if len(x) > 1: - priority, expansions = x - priority = int(priority) - else: - expansions ,= x - priority = None - - keep_all_tokens = name.startswith('!') - name = name.lstrip('!') - expand1 = name.startswith('?') - name = name.lstrip('?') - - return name, expansions, cls(keep_all_tokens, expand1, priority=priority) - - - _imported_grammars = {} def import_grammar(grammar_path): if grammar_path not in _imported_grammars: diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 2c9e8a4..6eb3fdf 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -15,13 +15,15 @@ from ..common import GrammarError, is_terminal from .grammar_analysis import GrammarAnalyzer class Action: + def __init__(self, name): + self.name = name def __str__(self): - return self.__name__ + return self.name def __repr__(self): 
return str(self) -class Shift(Action): pass -class Reduce(Action): pass +Shift = Action('Shift') +Reduce = Action('Reduce') class ParseTable: def __init__(self, states, start_state, end_state): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index bc45d4e..b093990 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -7,10 +7,6 @@ from ..common import ParseError, UnexpectedToken from .lalr_analysis import LALR_Analyzer, Shift -class FinalReduce: - def __init__(self, value): - self.value = value - class Parser: def __init__(self, parser_conf): assert all(r.options is None or r.options.priority is None @@ -20,6 +16,7 @@ class Parser: callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) for rule in analysis.rules} + self.parser_conf = parser_conf self.parser = _Parser(analysis.parse_table, callbacks) self.parse = self.parser.parse From e072d91760b4acfb7773389f44c001b66b6221f2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jan 2018 17:00:53 +0200 Subject: [PATCH 06/21] Updated README --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 542977f..62a645f 100644 --- a/README.md +++ b/README.md @@ -66,8 +66,8 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples) - Builds a parse-tree (AST) automagically, based on the structure of the grammar - **Earley** parser - - Can parse *ALL* context-free grammars - - Full support for ambiguity in grammar + - Can parse all context-free grammars + - Full support for ambiguous grammars - **LALR(1)** parser - Competitive with PLY - **EBNF** grammar @@ -86,7 +86,7 @@ See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/ #### Performance comparison -Lower is better! +Lark is the fastest and lightest (lower is better) ![Run-time Comparison](docs/comparison_runtime.png) @@ -99,14 +99,14 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail #### Feature comparison -| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? +| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | |:--------|:----------|:----|:--------|:------------|:------------ -| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | -| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | -| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | -| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | -| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | -| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | +| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! 
| +| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | +| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | +| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | +| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | +| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | (\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) From 401833536888289e3215346d275cae6aa5d5dc15 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jan 2018 17:02:06 +0200 Subject: [PATCH 07/21] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 62a645f..57ac62c 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail #### Feature comparison | Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | -|:--------|:----------|:----|:--------|:------------|:------------ +|:--------|:----------|:----|:--------|:------------|:------------|:---------- | **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | | [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | | [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | From 07b5469e8616567d85108dd237e3065b8c1e87c2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jan 2018 19:16:07 +0200 Subject: [PATCH 08/21] More refactoring, untangling grammar compilation and parse-tree creation --- lark/grammar.py | 15 ----------- lark/lark.py | 5 ++-- lark/load_grammar.py | 40 +++++++++++++++++++++++------- lark/parse_tree_builder.py | 51 ++++++++++++++++++-------------------- 4 files changed, 58 insertions(+), 53 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index f853182..d257bc4 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -26,21 +26,6 @@ class RuleOptions: self.filter_out = filter_out # remove this rule from the tree # used for "token"-rules in scanless - @classmethod - def from_rule(cls, name, *x): - if len(x) > 1: - priority, expansions = x - priority = int(priority) - else: - expansions ,= x - priority = None - - keep_all_tokens = name.startswith('!') - name = name.lstrip('!') - expand1 = name.startswith('?') - name = name.lstrip('?') - - return name, expansions, cls(keep_all_tokens, expand1, priority=priority) def __repr__(self): return 'RuleOptions(%r, %r, %r, %r, %r)' % ( diff --git a/lark/lark.py b/lark/lark.py index 03bd253..a7af772 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -165,14 +165,15 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) + self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) - rules, callback = self.parse_tree_builder.apply(self.options.transformer) + callback = self.parse_tree_builder.apply(self.options.transformer) if self.profiler: for f in dir(callback): if not (f.startswith('__') and f.endswith('__')): setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) - parser_conf = ParserConf(rules, callback, self.options.start) + parser_conf = ParserConf(self.rules, callback, self.options.start) return 
self.parser_class(self.lexer_conf, parser_conf, options=self.options) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ce4ec5a..b38a67c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR from .parsers.lalr_parser import UnexpectedToken from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef -from .grammar import RuleOptions +from .grammar import RuleOptions, Rule from .tree import Tree as T, Transformer, InlineTransformer, Visitor @@ -485,13 +485,21 @@ class Grammar: dict_update_safe(rules, ebnf_to_bnf.new_rules) - for tree, _o in rules.values(): + rule_tree_to_text = RuleTreeToText() + + new_rules = [] + for origin, (tree, options) in rules.items(): simplify_rule.visit(tree) + expansions = rule_tree_to_text.transform(tree) - rule_tree_to_text = RuleTreeToText() - rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()} + for expansion, alias in expansions: + if alias and origin.startswith('_'): + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) - return tokens, rules, self.ignore + rule = Rule(origin, expansion, alias, options) + new_rules.append(rule) + + return tokens, new_rules, self.ignore @@ -528,14 +536,28 @@ def resolve_token_references(token_defs): if not changed: break +def options_from_rule(name, *x): + if len(x) > 1: + priority, expansions = x + priority = int(priority) + else: + expansions ,= x + priority = None + + keep_all_tokens = name.startswith('!') + name = name.lstrip('!') + expand1 = name.startswith('?') + name = name.lstrip('?') + + return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority) class GrammarLoader: def __init__(self): tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] - rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()] - d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules} - rules, callback = ParseTreeBuilder(d, T).apply() + rules = [options_from_rule(name, x) for name, x in RULES.items()] + rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] + callback = ParseTreeBuilder(rules, T).apply() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) parser_conf = ParserConf(rules, callback, 'start') @@ -625,7 +647,7 @@ class GrammarLoader: # Resolve token references resolve_token_references(token_defs) - rules = [RuleOptions.from_rule(*x) for x in rule_defs] + rules = [options_from_rule(*x) for x in rule_defs] rule_names = set() for name, _x, _o in rules: diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index e26d287..4513583 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -109,49 +109,46 @@ class ParseTreeBuilder: def _init_builders(self, rules): filter_out = set() - for origin, (expansions, options) in rules.items(): - if options and options.filter_out: - assert origin.startswith('_') # Just to make sure - filter_out.add(origin) + for rule in rules: + if rule.options and rule.options.filter_out: + assert rule.origin.startswith('_') # Just to make sure + filter_out.add(rule.origin) - for origin, (expansions, options) in rules.items(): + for rule in rules: + options = rule.options keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) expand1 = options.expand1 
if options else False create_token = options.create_token if options else False - for expansion, alias in expansions: - if alias and origin.startswith('_'): - raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) + wrapper_chain = filter(None, [ + (expand1 and not rule.alias) and Expand1, + create_token and Factory(TokenWrapper, create_token), + create_rule_handler(rule.expansion, keep_all_tokens, filter_out), + self.propagate_positions and PropagatePositions, + ]) - wrapper_chain = filter(None, [ - (expand1 and not alias) and Expand1, - create_token and Factory(TokenWrapper, create_token), - create_rule_handler(expansion, keep_all_tokens, filter_out), - self.propagate_positions and PropagatePositions, - ]) - - yield origin, expansion, options, alias or origin, wrapper_chain + yield rule, wrapper_chain def apply(self, transformer=None): callback = Callback() - new_rules = [] - for origin, expansion, options, alias, wrapper_chain in self.rule_builders: - callback_name = '_callback_%s_%s' % (origin, '_'.join(expansion)) + for rule, wrapper_chain in self.rule_builders: + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + user_callback_name = rule.alias or rule.origin try: - f = transformer._get_func(alias) + f = transformer._get_func(user_callback_name) except AttributeError: - f = NodeBuilder(self.tree_class, alias) + f = NodeBuilder(self.tree_class, user_callback_name) + + rule.alias = internal_callback_name for w in wrapper_chain: f = w(f) - if hasattr(callback, callback_name): - raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) - setattr(callback, callback_name, f) - - new_rules.append( Rule( origin, expansion, callback_name, options )) + if hasattr(callback, internal_callback_name): + raise GrammarError("Rule '%s' already exists" % (rule,)) + setattr(callback, internal_callback_name, f) - return new_rules, callback + return callback From da1910f5b67b56528974aa9996abd46a103a37f2 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jan 2018 21:08:40 +0200 Subject: [PATCH 09/21] More refactoring towards standalone --- lark/common.py | 9 ++- lark/lark.py | 4 +- lark/lexer.py | 113 ++++++++++++++++++------------------ lark/load_grammar.py | 42 +++++++------- lark/parse_tree_builder.py | 6 +- lark/parsers/lalr_parser.py | 6 +- lark/tree.py | 4 +- 7 files changed, 96 insertions(+), 88 deletions(-) diff --git a/lark/common.py b/lark/common.py index 800aa4f..ff1897a 100644 --- a/lark/common.py +++ b/lark/common.py @@ -4,12 +4,18 @@ import sys Py36 = (sys.version_info[:2] >= (3, 6)) + +###{standalone +def is_terminal(sym): + return sym.isupper() + class GrammarError(Exception): pass class ParseError(Exception): pass +###} class UnexpectedToken(ParseError): def __init__(self, token, expected, seq, index): @@ -32,9 +38,6 @@ class UnexpectedToken(ParseError): -def is_terminal(sym): - return sym.isupper() - class LexerConf: def __init__(self, tokens, ignore=(), postlex=None): diff --git a/lark/lark.py b/lark/lark.py index a7af772..58a6ff7 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -166,8 +166,8 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) - callback = self.parse_tree_builder.apply(self.options.transformer) + 
self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) + callback = self._parse_tree_builder.create_callback(self.options.transformer) if self.profiler: for f in dir(callback): if not (f.startswith('__') and f.endswith('__')): diff --git a/lark/lexer.py b/lark/lexer.py index 5ca77de..4f673f6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -5,6 +5,7 @@ import re from .utils import Str, classify from .common import is_terminal, PatternStr, PatternRE, TokenDef +###{standalone class LexError(Exception): pass @@ -48,10 +49,60 @@ class Token(Str): __hash__ = Str.__hash__ -class Regex: - def __init__(self, pattern, flags=()): - self.pattern = pattern - self.flags = flags + +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. + """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + newline_types = list(newline_types) + line_ctr = LineCounter() + + while True: + lexer = self.lexer + for mre, type_from_index in lexer.mres: + m = mre.match(stream, line_ctr.char_pos) + if m: + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + lexer = yield t + + line_ctr.feed(value, type_ in newline_types) + break + else: + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + break +###} + def _regexp_has_newline(r): return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) @@ -182,57 +233,3 @@ class ContextualLexer: l.lexer = self.lexers[self.parser_state] -###{lexer - -class LineCounter: - def __init__(self): - self.newline_char = '\n' - self.char_pos = 0 - self.line = 1 - self.column = 0 - self.line_start_pos = 0 - - def feed(self, token, test_newline=True): - """Consume a token and calculate the new line & column. - - As an optional optimization, set test_newline=False is token doesn't contain a newline. 
- """ - if test_newline: - newlines = token.count(self.newline_char) - if newlines: - self.line += newlines - self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 - - self.char_pos += len(token) - self.column = self.char_pos - self.line_start_pos - -class _Lex: - "Built to serve both Lexer and ContextualLexer" - def __init__(self, lexer): - self.lexer = lexer - - def lex(self, stream, newline_types, ignore_types): - newline_types = list(newline_types) - newline_types = list(newline_types) - line_ctr = LineCounter() - - while True: - lexer = self.lexer - for mre, type_from_index in lexer.mres: - m = mre.match(stream, line_ctr.char_pos) - if m: - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - lexer = yield t - - line_ctr.feed(value, type_ in newline_types) - break - else: - if line_ctr.char_pos < len(stream): - raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) - break -###} diff --git a/lark/load_grammar.py b/lark/load_grammar.py index b38a67c..2086591 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -128,7 +128,7 @@ RULES = { class EBNF_to_BNF(InlineTransformer): def __init__(self): - self.new_rules = {} + self.new_rules = [] self.rules_by_expr = {} self.prefix = 'anon' self.i = 0 @@ -141,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer): new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) self.i += 1 t = Token('RULE', new_name, -1) - self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options + tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) + self.new_rules.append((new_name, tree, self.rule_options)) self.rules_by_expr[expr] = t return t @@ -390,12 +391,6 @@ def _interleave(l, item): def _choice_of_rules(rules): return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) -def dict_update_safe(d1, d2): - for k, v in d2.items(): - assert k not in d1 - d1[k] = v - - class Grammar: def __init__(self, rule_defs, token_defs, ignore): self.token_defs = token_defs @@ -468,38 +463,41 @@ class Grammar: # ================= # Compile Rules # ================= - ebnf_to_bnf = EBNF_to_BNF() - simplify_rule = SimplifyRule_Visitor() + # 1. Pre-process terminals transformer = PrepareLiterals() if not lexer: transformer *= SplitLiterals() transformer *= ExtractAnonTokens(tokens) # Adds to tokens - rules = {} + # 2. Convert EBNF to BNF (and apply step 1) + ebnf_to_bnf = EBNF_to_BNF() + rules = [] for name, rule_tree, options in rule_defs: - assert name not in rules, name ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None tree = transformer.transform(rule_tree) - rules[name] = ebnf_to_bnf.transform(tree), options + rules.append((name, ebnf_to_bnf.transform(tree), options)) + rules += ebnf_to_bnf.new_rules - dict_update_safe(rules, ebnf_to_bnf.new_rules) + assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" + # 3. 
Compile tree to Rule objects rule_tree_to_text = RuleTreeToText() - new_rules = [] - for origin, (tree, options) in rules.items(): + simplify_rule = SimplifyRule_Visitor() + compiled_rules = [] + for name, tree, options in rules: simplify_rule.visit(tree) expansions = rule_tree_to_text.transform(tree) for expansion, alias in expansions: - if alias and origin.startswith('_'): - raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) + if alias and name.startswith('_'): + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - rule = Rule(origin, expansion, alias, options) - new_rules.append(rule) + rule = Rule(name, expansion, alias, options) + compiled_rules.append(rule) - return tokens, new_rules, self.ignore + return tokens, compiled_rules, self.ignore @@ -557,7 +555,7 @@ class GrammarLoader: rules = [options_from_rule(name, x) for name, x in RULES.items()] rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] - callback = ParseTreeBuilder(rules, T).apply() + callback = ParseTreeBuilder(rules, T).create_callback() lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) parser_conf = ParserConf(rules, callback, 'start') diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 4513583..f960931 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -3,6 +3,8 @@ from .utils import suppress from .lexer import Token from .grammar import Rule +###{standalone + class NodeBuilder: def __init__(self, tree_class, name): self.tree_class = tree_class @@ -130,7 +132,7 @@ class ParseTreeBuilder: yield rule, wrapper_chain - def apply(self, transformer=None): + def create_callback(self, transformer=None): callback = Callback() for rule, wrapper_chain in self.rule_builders: @@ -152,3 +154,5 @@ class ParseTreeBuilder: setattr(callback, internal_callback_name, f) return callback + +###} diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index b093990..eafc4ea 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -3,7 +3,7 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..common import ParseError, UnexpectedToken +from ..common import UnexpectedToken from .lalr_analysis import LALR_Analyzer, Shift @@ -20,6 +20,8 @@ class Parser: self.parser = _Parser(analysis.parse_table, callbacks) self.parse = self.parser.parse +###{standalone + class _Parser: def __init__(self, parse_table, callbacks): self.states = parse_table.states @@ -90,3 +92,5 @@ class _Parser: return val else: reduce(arg) + +###} diff --git a/lark/tree.py b/lark/tree.py index f832857..1639bb1 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -7,6 +7,7 @@ from copy import deepcopy from .utils import inline_args +###{standalone class Tree(object): def __init__(self, data, children): self.data = data @@ -33,6 +34,7 @@ class Tree(object): def pretty(self, indent_str=' '): return ''.join(self._pretty(0, indent_str)) +###} def expand_kids_by_index(self, *indices): for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices @@ -138,7 +140,7 @@ class TransformerChain(object): def __mul__(self, other): return TransformerChain(*self.transformers + (other,)) - + class InlineTransformer(Transformer): From 5ac4120b71d9481eccba04e7c9a746c50be38fdc Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 00:50:12 +0200 Subject: [PATCH 
10/21] Stand-alone tool working for LALR+traditional lexer (first commit) --- lark/parse_tree_builder.py | 3 + lark/tools/standalone.py | 184 +++++++++++++++++++++++++++++++++++++ lark/tree.py | 2 + lark/utils.py | 36 ++++---- 4 files changed, 208 insertions(+), 17 deletions(-) create mode 100644 lark/tools/standalone.py diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index f960931..7e52125 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -109,6 +109,8 @@ class ParseTreeBuilder: self.rule_builders = list(self._init_builders(rules)) + self.user_aliases = {} + def _init_builders(self, rules): filter_out = set() for rule in rules: @@ -144,6 +146,7 @@ class ParseTreeBuilder: except AttributeError: f = NodeBuilder(self.tree_class, user_callback_name) + self.user_aliases[rule] = rule.alias rule.alias = internal_callback_name for w in wrapper_chain: diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py new file mode 100644 index 0000000..54dc69a --- /dev/null +++ b/lark/tools/standalone.py @@ -0,0 +1,184 @@ +###{standalone +# +# +# Lark Stand-alone Generator Tool +# ---------------------------------- +# Git: https://github.com/erezsh/lark +# Author: Erez Shinan (erezshin@gmail.com) +# +# +# >>> LICENSE +# +# This tool and its generated code use a separate license from Lark. +# +# It is licensed under GPLv2 or above. +# +# If you wish to purchase a commercial license for this tool and its +# generated code, contact me via email. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# See . 
+# +# +###} + +import codecs +import sys +import os +from pprint import pprint +from os import path +from collections import defaultdict + +import lark +from lark import Lark + +from ..grammar import Rule + +__dir__ = path.dirname(__file__) +__larkdir__ = path.join(__dir__, path.pardir) + + +EXTRACT_STANDALONE_FILES = [ + 'tools/standalone.py', + 'utils.py', + 'common.py', + 'tree.py', + 'lexer.py', + 'parse_tree_builder.py', + 'parsers/lalr_parser.py', +] + + +def extract_sections(lines): + section = None + text = [] + sections = defaultdict(list) + for l in lines: + if l.startswith('###'): + if l[3] == '{': + section = l[4:].strip() + elif l[3] == '}': + sections[section] += text + section = None + text = [] + else: + raise ValueError(l) + elif section: + text.append(l) + + return {name:''.join(text) for name, text in sections.items()} + +class LexerAtoms: + def __init__(self, lexer): + assert not lexer.callback + self.mres = [(p.pattern,d) for p,d in lexer.mres] + self.newline_types = lexer.newline_types + self.ignore_types = lexer.ignore_types + + def print_python(self): + print('import re') + print('MRES = (') + pprint(self.mres) + print(')') + print('NEWLINE_TYPES = %s' % self.newline_types) + print('IGNORE_TYPES = %s' % self.ignore_types) + print('class LexerRegexps: pass') + print('lexer_regexps = LexerRegexps()') + print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') + print('lexer_regexps.callback = {}') + print('lexer = _Lex(lexer_regexps)') + print('def lex(stream):') + print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') + + +class GetRule: + def __init__(self, rule_id): + self.rule_id = rule_id + + def __repr__(self): + return 'RULE_ID[%d]' % self.rule_id + + +def get_rule_ids(x): + if isinstance(x, (tuple, list)): + return type(x)(map(get_rule_ids, x)) + elif isinstance(x, dict): + return {get_rule_ids(k):get_rule_ids(v) for k, v in x.items()} + elif isinstance(x, Rule): + return GetRule(id(x)) + return x + +class ParserAtoms: + def __init__(self, parser): + self.parse_table = parser.analysis.parse_table + + def print_python(self): + print('class ParseTable: pass') + print('parse_table = ParseTable()') + print('parse_table.states = (') + pprint(get_rule_ids(self.parse_table.states)) + print(')') + print('parse_table.start_state = %s' % self.parse_table.start_state) + print('parse_table.end_state = %s' % self.parse_table.end_state) + print('class Lark_StandAlone:') + print(' def __init__(self, transformer=None):') + print(' callback = parse_tree_builder.create_callback(transformer=transformer)') + print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES}') + print(' self.parser = _Parser(parse_table, callbacks)') + print(' def parse(self, stream):') + print(' return self.parser.parse(lex(stream))') + +class TreeBuilderAtoms: + def __init__(self, lark): + self.rules = lark.rules + self.ptb = lark._parse_tree_builder + + def print_python(self): + print('RULE_ID = {') + for r in self.rules: + print(' %d: Rule(%r, %r, %r, %r),' % (id(r), r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) + print('}') + print('RULES = list(RULE_ID.values())') + print('parse_tree_builder = ParseTreeBuilder(RULES, Tree)') + +def main(fn): + with codecs.open(fn, encoding='utf8') as f: + lark_inst = Lark(f, parser="lalr") + + lexer_atoms = LexerAtoms(lark_inst.parser.lexer) + parser_atoms = ParserAtoms(lark_inst.parser.parser) + tree_builder_atoms = TreeBuilderAtoms(lark_inst) + + print('# Generated by Lark v%s' % 
lark.__version__) + + + for pyfile in EXTRACT_STANDALONE_FILES: + print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone']) + + print(open(os.path.join(__larkdir__, 'grammar.py')).read()) + print('Shift = 0') + print('Reduce = 1') + lexer_atoms.print_python() + tree_builder_atoms.print_python() + parser_atoms.print_python() + + # print('print(parser.parse(lex("1+2")).pretty())') + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Generates a stand-alone lalr parser") + print("Usage: %s " % sys.argv[0]) + sys.exit(1) + + fn ,= sys.argv[1:] + + main(fn) diff --git a/lark/tree.py b/lark/tree.py index 1639bb1..9c8e7da 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -101,6 +101,7 @@ class Tree(object): +###{standalone class Transformer(object): def _get_func(self, name): return getattr(self, name) @@ -197,6 +198,7 @@ class Transformer_NoRecurse(Transformer): def __default__(self, t): return t +###} def pydot__tree_to_png(tree, filename): diff --git a/lark/utils.py b/lark/utils.py index d984400..01c70a1 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,7 +1,4 @@ -import functools -import types from collections import deque -from contextlib import contextmanager class fzset(frozenset): def __repr__(self): @@ -49,8 +46,13 @@ try: except NameError: # Python 3 STRING_TYPE = str -Str = type(u'') +###{standalone +import types +import functools +from contextlib import contextmanager + +Str = type(u'') def inline_args(f): # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) @@ -76,19 +78,6 @@ def inline_args(f): return _f - -try: - compare = cmp -except NameError: - def compare(a, b): - if a == b: - return 0 - elif a > b: - return 1 - else: - return -1 - - try: from contextlib import suppress # Python 3 except ImportError: @@ -107,6 +96,19 @@ except ImportError: except excs: pass +###} +try: + compare = cmp +except NameError: + def compare(a, b): + if a == b: + return 0 + elif a > b: + return 1 + else: + return -1 + + From a409f2835c9715be41c37c9426a4e0afa634556c Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 00:56:09 +0200 Subject: [PATCH 11/21] Corrections to the standalone tool --- lark/tools/standalone.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 54dc69a..45bd18d 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -3,6 +3,8 @@ # # Lark Stand-alone Generator Tool # ---------------------------------- +# Generates a stand-alone LALR(1) parser with a standard lexer +# # Git: https://github.com/erezsh/lark # Author: Erez Shinan (erezshin@gmail.com) # @@ -158,8 +160,7 @@ def main(fn): parser_atoms = ParserAtoms(lark_inst.parser.parser) tree_builder_atoms = TreeBuilderAtoms(lark_inst) - print('# Generated by Lark v%s' % lark.__version__) - + print('# The file was automatically generated by Lark v%s' % lark.__version__) for pyfile in EXTRACT_STANDALONE_FILES: print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone']) @@ -171,12 +172,10 @@ def main(fn): tree_builder_atoms.print_python() parser_atoms.print_python() - # print('print(parser.parse(lex("1+2")).pretty())') - if __name__ == '__main__': if len(sys.argv) < 2: - print("Generates a stand-alone lalr parser") - print("Usage: %s " % sys.argv[0]) + print("Lark Stand-alone Generator Tool") + print("Usage: python -m lark.tools.standalone ") sys.exit(1) fn ,= sys.argv[1:] From 
9b0672fda646c6bbe662e4e51d2d5e3bdc700d77 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 10:28:25 +0200 Subject: [PATCH 12/21] Standalone tools now supports postlex --- lark/common.py | 4 ++-- lark/indenter.py | 3 +++ lark/lexer.py | 25 ++++++++++++------------- lark/tools/standalone.py | 32 +++++++++++++++++++++++--------- 4 files changed, 40 insertions(+), 24 deletions(-) diff --git a/lark/common.py b/lark/common.py index ff1897a..1717fe7 100644 --- a/lark/common.py +++ b/lark/common.py @@ -15,8 +15,6 @@ class GrammarError(Exception): class ParseError(Exception): pass -###} - class UnexpectedToken(ParseError): def __init__(self, token, expected, seq, index): self.token = token @@ -37,6 +35,8 @@ class UnexpectedToken(ParseError): super(UnexpectedToken, self).__init__(message) +###} + class LexerConf: diff --git a/lark/indenter.py b/lark/indenter.py index a5f107d..34e61a0 100644 --- a/lark/indenter.py +++ b/lark/indenter.py @@ -2,6 +2,7 @@ from .lexer import Token +###{standalone class Indenter: def __init__(self): self.paren_level = 0 @@ -50,3 +51,5 @@ class Indenter: @property def always_accept(self): return (self.NL_type,) + +###} diff --git a/lark/lexer.py b/lark/lexer.py index 4f673f6..844025d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -101,25 +101,23 @@ class _Lex: if line_ctr.char_pos < len(stream): raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) break -###} - -def _regexp_has_newline(r): - return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) +class UnlessCallback: + def __init__(self, mres): + self.mres = mres -def _create_unless_callback(strs): - mres = build_mres(strs, match_whole=True) - def unless_callback(t): - # if t in strs: - # t.type = strs[t] - for mre, type_from_index in mres: + def __call__(self, t): + for mre, type_from_index in self.mres: m = mre.match(t.value) if m: value = m.group(0) t.type = type_from_index[m.lastindex] break return t - return unless_callback + +###} + + def _create_unless(tokens): tokens_by_type = classify(tokens, lambda t: type(t.pattern)) @@ -136,7 +134,7 @@ def _create_unless(tokens): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = _create_unless_callback(unless) + callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) tokens = [t for t in tokens if t not in embedded_strs] return tokens, callback @@ -161,7 +159,8 @@ def _build_mres(tokens, max_size, match_whole): def build_mres(tokens, match_whole=False): return _build_mres(tokens, len(tokens), match_whole) - +def _regexp_has_newline(r): + return '\n' in r or '\\n' in r or ('(?s)' in r and '.' 
in r) class Lexer: def __init__(self, tokens, ignore=()): diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 45bd18d..7a9f5a2 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -54,6 +54,7 @@ EXTRACT_STANDALONE_FILES = [ 'utils.py', 'common.py', 'tree.py', + 'indenter.py', 'lexer.py', 'parse_tree_builder.py', 'parsers/lalr_parser.py', @@ -81,22 +82,27 @@ def extract_sections(lines): class LexerAtoms: def __init__(self, lexer): - assert not lexer.callback self.mres = [(p.pattern,d) for p,d in lexer.mres] self.newline_types = lexer.newline_types self.ignore_types = lexer.ignore_types + self.callback = {name:[(p.pattern,d) for p,d in c.mres] + for name, c in lexer.callback.items()} def print_python(self): print('import re') print('MRES = (') pprint(self.mres) print(')') + print('LEXER_CALLBACK = (') + pprint(self.callback) + print(')') print('NEWLINE_TYPES = %s' % self.newline_types) print('IGNORE_TYPES = %s' % self.ignore_types) print('class LexerRegexps: pass') print('lexer_regexps = LexerRegexps()') print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') - print('lexer_regexps.callback = {}') + print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])') + print(' for n, mres in LEXER_CALLBACK.items()}') print('lexer = _Lex(lexer_regexps)') print('def lex(stream):') print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') @@ -132,12 +138,15 @@ class ParserAtoms: print('parse_table.start_state = %s' % self.parse_table.start_state) print('parse_table.end_state = %s' % self.parse_table.end_state) print('class Lark_StandAlone:') - print(' def __init__(self, transformer=None):') + print(' def __init__(self, transformer=None, postlex=None):') print(' callback = parse_tree_builder.create_callback(transformer=transformer)') print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES}') print(' self.parser = _Parser(parse_table, callbacks)') + print(' self.postlex = postlex') print(' def parse(self, stream):') - print(' return self.parser.parse(lex(stream))') + print(' tokens = lex(stream)') + print(' if self.postlex: tokens = self.postlex.process(tokens)') + print(' return self.parser.parse(tokens)') class TreeBuilderAtoms: def __init__(self, lark): @@ -152,9 +161,9 @@ class TreeBuilderAtoms: print('RULES = list(RULE_ID.values())') print('parse_tree_builder = ParseTreeBuilder(RULES, Tree)') -def main(fn): +def main(fn, start): with codecs.open(fn, encoding='utf8') as f: - lark_inst = Lark(f, parser="lalr") + lark_inst = Lark(f, parser="lalr", start=start) lexer_atoms = LexerAtoms(lark_inst.parser.lexer) parser_atoms = ParserAtoms(lark_inst.parser.parser) @@ -175,9 +184,14 @@ def main(fn): if __name__ == '__main__': if len(sys.argv) < 2: print("Lark Stand-alone Generator Tool") - print("Usage: python -m lark.tools.standalone ") + print("Usage: python -m lark.tools.standalone []") sys.exit(1) - fn ,= sys.argv[1:] + if len(sys.argv) == 3: + fn, start = sys.argv[1:] + elif len(sys.argv) == 2: + fn, start = sys.argv[1], 'start' + else: + assert False, sys.argv - main(fn) + main(fn, start) From e697c266a7cab63ba79e5bc9eb547e59e05f118e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 13:11:15 +0200 Subject: [PATCH 13/21] Standalone: Significantly reduced generated code size --- lark/tools/standalone.py | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py 
index 7a9f5a2..0444614 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -42,6 +42,7 @@ from collections import defaultdict import lark from lark import Lark +from lark.parsers.lalr_analysis import Shift, Reduce from ..grammar import Rule @@ -113,17 +114,15 @@ class GetRule: self.rule_id = rule_id def __repr__(self): - return 'RULE_ID[%d]' % self.rule_id + return 'RULES[%d]' % self.rule_id +rule_ids = {} +token_types = {} -def get_rule_ids(x): - if isinstance(x, (tuple, list)): - return type(x)(map(get_rule_ids, x)) - elif isinstance(x, dict): - return {get_rule_ids(k):get_rule_ids(v) for k, v in x.items()} - elif isinstance(x, Rule): - return GetRule(id(x)) - return x +def _get_token_type(token_type): + if token_type not in token_types: + token_types[token_type] = len(token_types) + return token_types[token_type] class ParserAtoms: def __init__(self, parser): @@ -132,15 +131,22 @@ class ParserAtoms: def print_python(self): print('class ParseTable: pass') print('parse_table = ParseTable()') - print('parse_table.states = (') - pprint(get_rule_ids(self.parse_table.states)) + print('STATES = {') + for state, actions in self.parse_table.states.items(): + print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg)) + for token, (action, arg) in actions.items()})) + print('}') + print('TOKEN_TYPES = (') + pprint({v:k for k, v in token_types.items()}) print(')') + print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}') + print(' for s, acts in STATES.items()}') print('parse_table.start_state = %s' % self.parse_table.start_state) print('parse_table.end_state = %s' % self.parse_table.end_state) print('class Lark_StandAlone:') print(' def __init__(self, transformer=None, postlex=None):') print(' callback = parse_tree_builder.create_callback(transformer=transformer)') - print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES}') + print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}') print(' self.parser = _Parser(parse_table, callbacks)') print(' self.postlex = postlex') print(' def parse(self, stream):') @@ -154,12 +160,12 @@ class TreeBuilderAtoms: self.ptb = lark._parse_tree_builder def print_python(self): - print('RULE_ID = {') - for r in self.rules: - print(' %d: Rule(%r, %r, %r, %r),' % (id(r), r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) + print('RULES = {') + for i, r in enumerate(self.rules): + rule_ids[r] = i + print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) print('}') - print('RULES = list(RULE_ID.values())') - print('parse_tree_builder = ParseTreeBuilder(RULES, Tree)') + print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') def main(fn, start): with codecs.open(fn, encoding='utf8') as f: From f9b02c1f13d9ffe462d0d646dbcec51d7e6b68dc Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 14:45:56 +0200 Subject: [PATCH 14/21] Updated README to mention standalone --- README.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 57ac62c..794a203 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Lark can: - Build a parse-tree automagically, no construction code required - Outperform all other Python libraries when using LALR(1) (Yes, including PLY) - Run on every Python interpreter (it's pure-python) + - 
Generate a stand-alone parser (for LALR(1) grammars) And many more features. Read ahead and find out. @@ -69,7 +70,8 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples) - Can parse all context-free grammars - Full support for ambiguous grammars - **LALR(1)** parser - - Competitive with PLY + - Fast and light, competitive with PLY + - Can generate a stand-alone parser - **EBNF** grammar - **Unicode** fully supported - **Python 2 & 3** compatible @@ -99,17 +101,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail #### Feature comparison -| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | -|:--------|:----------|:----|:--------|:------------|:------------|:---------- -| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | -| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | -| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | -| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | -| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | -| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | +| Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone +|:--------|:----------|:----|:--------|:------------|:------------|:----------|:---------- +| **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) | +| [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No | +| [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No | +| [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No | +| [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No | +| [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No | -(\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) +(\* *PEGs cannot handle non-deterministic grammars. 
Also, according to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) ### Projects using Lark From 4679a348cea97f633a486a5b14cc32ba59c72f2e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 10 Jan 2018 14:46:25 +0200 Subject: [PATCH 15/21] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 930fa01..1637a75 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError from .lark import Lark from .utils import inline_args -__version__ = "0.5.1" +__version__ = "0.5.2" From 4d219ae837aaf15c6d1c533358683e30abf837c1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 11 Jan 2018 16:02:02 +0200 Subject: [PATCH 16/21] Added standalone example --- examples/standalone/create_standalone.sh | 1 + examples/standalone/json.g | 21 + examples/standalone/json_parser.py | 794 +++++++++++++++++++++++ examples/standalone/json_parser_main.py | 25 + 4 files changed, 841 insertions(+) create mode 100755 examples/standalone/create_standalone.sh create mode 100644 examples/standalone/json.g create mode 100644 examples/standalone/json_parser.py create mode 100644 examples/standalone/json_parser_main.py diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh new file mode 100755 index 0000000..1eba3a4 --- /dev/null +++ b/examples/standalone/create_standalone.sh @@ -0,0 +1 @@ +python -m lark.tools.standalone json.g > json_parser.py diff --git a/examples/standalone/json.g b/examples/standalone/json.g new file mode 100644 index 0000000..243a230 --- /dev/null +++ b/examples/standalone/json.g @@ -0,0 +1,21 @@ +?start: value + +?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null + +array : "[" [value ("," value)*] "]" +object : "{" [pair ("," pair)*] "}" +pair : string ":" value + +string : ESCAPED_STRING + +%import common.ESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS + +%ignore WS diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py new file mode 100644 index 0000000..f249f61 --- /dev/null +++ b/examples/standalone/json_parser.py @@ -0,0 +1,794 @@ +# The file was automatically generated by Lark v0.5.2 +# +# +# Lark Stand-alone Generator Tool +# ---------------------------------- +# Generates a stand-alone LALR(1) parser with a standard lexer +# +# Git: https://github.com/erezsh/lark +# Author: Erez Shinan (erezshin@gmail.com) +# +# +# >>> LICENSE +# +# This tool and its generated code use a separate license from Lark. +# +# It is licensed under GPLv2 or above. +# +# If you wish to purchase a commercial license for this tool and its +# generated code, contact me via email. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# See . 
+# +# + + +import types +import functools +from contextlib import contextmanager + +Str = type(u'') + +def inline_args(f): + # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) + if isinstance(f, types.FunctionType): + @functools.wraps(f) + def _f_func(self, args): + return f(self, *args) + return _f_func + elif isinstance(f, (type, types.BuiltinFunctionType)): + @functools.wraps(f) + def _f_builtin(_self, args): + return f(*args) + return _f_builtin + elif isinstance(f, types.MethodType): + @functools.wraps(f.__func__) + def _f(self, args): + return f.__func__(self, *args) + return _f + else: + @functools.wraps(f.__call__.__func__) + def _f(self, args): + return f.__call__.__func__(self, *args) + return _f + + +try: + from contextlib import suppress # Python 3 +except ImportError: + @contextmanager + def suppress(*excs): + '''Catch and dismiss the provided exception + + >>> x = 'hello' + >>> with suppress(IndexError): + ... x = x[10] + >>> x + 'hello' + ''' + try: + yield + except excs: + pass + + +def is_terminal(sym): + return sym.isupper() + +class GrammarError(Exception): + pass + +class ParseError(Exception): + pass + +class UnexpectedToken(ParseError): + def __init__(self, token, expected, seq, index): + self.token = token + self.expected = expected + self.line = getattr(token, 'line', '?') + self.column = getattr(token, 'column', '?') + + try: + context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]]) + except AttributeError: + context = seq[index:index+5] + except TypeError: + context = "" + message = ("Unexpected token %r at line %s, column %s.\n" + "Expected: %s\n" + "Context: %s" % (token, self.line, self.column, expected, context)) + + super(UnexpectedToken, self).__init__(message) + + + +class Tree(object): + def __init__(self, data, children): + self.data = data + self.children = list(children) + + def __repr__(self): + return 'Tree(%s, %s)' % (self.data, self.children) + + def _pretty_label(self): + return self.data + + def _pretty(self, level, indent_str): + if len(self.children) == 1 and not isinstance(self.children[0], Tree): + return [ indent_str*level, self._pretty_label(), '\t', '%s' % self.children[0], '\n'] + + l = [ indent_str*level, self._pretty_label(), '\n' ] + for n in self.children: + if isinstance(n, Tree): + l += n._pretty(level+1, indent_str) + else: + l += [ indent_str*(level+1), '%s' % n, '\n' ] + + return l + + def pretty(self, indent_str=' '): + return ''.join(self._pretty(0, indent_str)) +class Transformer(object): + def _get_func(self, name): + return getattr(self, name) + + def transform(self, tree): + items = [] + for c in tree.children: + try: + items.append(self.transform(c) if isinstance(c, Tree) else c) + except Discard: + pass + try: + f = self._get_func(tree.data) + except AttributeError: + return self.__default__(tree.data, items) + else: + return f(items) + + def __default__(self, data, children): + return Tree(data, children) + + def __mul__(self, other): + return TransformerChain(self, other) + + +class Discard(Exception): + pass + +class TransformerChain(object): + def __init__(self, *transformers): + self.transformers = transformers + + def transform(self, tree): + for t in self.transformers: + tree = t.transform(tree) + return tree + + def __mul__(self, other): + return TransformerChain(*self.transformers + (other,)) + + + +class InlineTransformer(Transformer): + def _get_func(self, name): # use super()._get_func + return 
inline_args(getattr(self, name)).__get__(self) + + +class Visitor(object): + def visit(self, tree): + for child in tree.children: + if isinstance(child, Tree): + self.visit(child) + + f = getattr(self, tree.data, self.__default__) + f(tree) + return tree + + def __default__(self, tree): + pass + + +class Visitor_NoRecurse(Visitor): + def visit(self, tree): + subtrees = list(tree.iter_subtrees()) + + for subtree in (subtrees): + getattr(self, subtree.data, self.__default__)(subtree) + return tree + + +class Transformer_NoRecurse(Transformer): + def transform(self, tree): + subtrees = list(tree.iter_subtrees()) + + def _t(t): + # Assumes t is already transformed + try: + f = self._get_func(t.data) + except AttributeError: + return self.__default__(t) + else: + return f(t) + + for subtree in subtrees: + children = [] + for c in subtree.children: + try: + children.append(_t(c) if isinstance(c, Tree) else c) + except Discard: + pass + subtree.children = children + + return _t(tree) + + def __default__(self, t): + return t + +class Indenter: + def __init__(self): + self.paren_level = 0 + self.indent_level = [0] + + def handle_NL(self, token): + if self.paren_level > 0: + return + + yield token + + indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces + indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len + + if indent > self.indent_level[-1]: + self.indent_level.append(indent) + yield Token.new_borrow_pos(self.INDENT_type, indent_str, token) + else: + while indent < self.indent_level[-1]: + self.indent_level.pop() + yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token) + + assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) + + def process(self, stream): + for token in stream: + if token.type == self.NL_type: + for t in self.handle_NL(token): + yield t + else: + yield token + + if token.type in self.OPEN_PAREN_types: + self.paren_level += 1 + elif token.type in self.CLOSE_PAREN_types: + self.paren_level -= 1 + assert self.paren_level >= 0 + + while len(self.indent_level) > 1: + self.indent_level.pop() + yield Token(self.DEDENT_type, '') + + assert self.indent_level == [0], self.indent_level + + # XXX Hack for ContextualLexer. Maybe there's a more elegant solution? 
+ @property + def always_accept(self): + return (self.NL_type,) + + +class LexError(Exception): + pass + +class UnexpectedInput(LexError): + def __init__(self, seq, lex_pos, line, column, allowed=None): + context = seq[lex_pos:lex_pos+5] + message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column) + + super(UnexpectedInput, self).__init__(message) + + self.line = line + self.column = column + self.context = context + self.allowed = allowed + +class Token(Str): + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None): + inst = Str.__new__(cls, value) + inst.type = type_ + inst.pos_in_stream = pos_in_stream + inst.value = value + inst.line = line + inst.column = column + return inst + + @classmethod + def new_borrow_pos(cls, type_, value, borrow_t): + return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column) + + def __repr__(self): + return 'Token(%s, %r)' % (self.type, self.value) + + def __deepcopy__(self, memo): + return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) + + def __eq__(self, other): + if isinstance(other, Token) and self.type != other.type: + return False + + return Str.__eq__(self, other) + + __hash__ = Str.__hash__ + + +class LineCounter: + def __init__(self): + self.newline_char = '\n' + self.char_pos = 0 + self.line = 1 + self.column = 0 + self.line_start_pos = 0 + + def feed(self, token, test_newline=True): + """Consume a token and calculate the new line & column. + + As an optional optimization, set test_newline=False is token doesn't contain a newline. + """ + if test_newline: + newlines = token.count(self.newline_char) + if newlines: + self.line += newlines + self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 + + self.char_pos += len(token) + self.column = self.char_pos - self.line_start_pos + +class _Lex: + "Built to serve both Lexer and ContextualLexer" + def __init__(self, lexer): + self.lexer = lexer + + def lex(self, stream, newline_types, ignore_types): + newline_types = list(newline_types) + newline_types = list(newline_types) + line_ctr = LineCounter() + + while True: + lexer = self.lexer + for mre, type_from_index in lexer.mres: + m = mre.match(stream, line_ctr.char_pos) + if m: + value = m.group(0) + type_ = type_from_index[m.lastindex] + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + lexer = yield t + + line_ctr.feed(value, type_ in newline_types) + break + else: + if line_ctr.char_pos < len(stream): + raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) + break + +class UnlessCallback: + def __init__(self, mres): + self.mres = mres + + def __call__(self, t): + for mre, type_from_index in self.mres: + m = mre.match(t.value) + if m: + value = m.group(0) + t.type = type_from_index[m.lastindex] + break + return t + + + +class NodeBuilder: + def __init__(self, tree_class, name): + self.tree_class = tree_class + self.name = name + + def __call__(self, children): + return self.tree_class(self.name, children) + +class Expand1: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + if len(children) == 1: + return children[0] + else: + return self.node_builder(children) + +class Factory: + def __init__(self, cls, *args): + self.cls = cls + self.args = args + + def __call__(self, node_builder): + return 
self.cls(node_builder, *self.args) + + +class TokenWrapper: + "Used for fixing the results of scanless parsing" + + def __init__(self, node_builder, token_name): + self.node_builder = node_builder + self.token_name = token_name + + def __call__(self, children): + return self.node_builder( [Token(self.token_name, ''.join(children))] ) + +def identity(node_builder): + return node_builder + + +class ChildFilter: + def __init__(self, node_builder, to_include): + self.node_builder = node_builder + self.to_include = to_include + + def __call__(self, children): + filtered = [] + for i, to_expand in self.to_include: + if to_expand: + filtered += children[i].children + else: + filtered.append(children[i]) + + return self.node_builder(filtered) + +def create_rule_handler(expansion, keep_all_tokens, filter_out): + # if not keep_all_tokens: + to_include = [(i, not is_terminal(sym) and sym.startswith('_')) + for i, sym in enumerate(expansion) + if keep_all_tokens + or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out) + ] + + if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): + return Factory(ChildFilter, to_include) + + # else, if no filtering required.. + return identity + +class PropagatePositions: + def __init__(self, node_builder): + self.node_builder = node_builder + + def __call__(self, children): + res = self.node_builder(children) + + if children: + for a in children: + with suppress(AttributeError): + res.line = a.line + res.column = a.column + break + + for a in reversed(children): + with suppress(AttributeError): + res.end_line = a.end_line + res.end_col = a.end_col + break + + return res + + +class Callback(object): + pass + +class ParseTreeBuilder: + def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False): + self.tree_class = tree_class + self.propagate_positions = propagate_positions + self.always_keep_all_tokens = keep_all_tokens + + self.rule_builders = list(self._init_builders(rules)) + + self.user_aliases = {} + + def _init_builders(self, rules): + filter_out = set() + for rule in rules: + if rule.options and rule.options.filter_out: + assert rule.origin.startswith('_') # Just to make sure + filter_out.add(rule.origin) + + for rule in rules: + options = rule.options + keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) + expand1 = options.expand1 if options else False + create_token = options.create_token if options else False + + wrapper_chain = filter(None, [ + (expand1 and not rule.alias) and Expand1, + create_token and Factory(TokenWrapper, create_token), + create_rule_handler(rule.expansion, keep_all_tokens, filter_out), + self.propagate_positions and PropagatePositions, + ]) + + yield rule, wrapper_chain + + + def create_callback(self, transformer=None): + callback = Callback() + + for rule, wrapper_chain in self.rule_builders: + internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) + + user_callback_name = rule.alias or rule.origin + try: + f = transformer._get_func(user_callback_name) + except AttributeError: + f = NodeBuilder(self.tree_class, user_callback_name) + + self.user_aliases[rule] = rule.alias + rule.alias = internal_callback_name + + for w in wrapper_chain: + f = w(f) + + if hasattr(callback, internal_callback_name): + raise GrammarError("Rule '%s' already exists" % (rule,)) + setattr(callback, internal_callback_name, f) + + return callback + + + +class _Parser: + def __init__(self, parse_table, 
callbacks): + self.states = parse_table.states + self.start_state = parse_table.start_state + self.end_state = parse_table.end_state + self.callbacks = callbacks + + def parse(self, seq, set_state=None): + i = 0 + token = None + stream = iter(seq) + states = self.states + + state_stack = [self.start_state] + value_stack = [] + + if set_state: set_state(self.start_state) + + def get_action(key): + state = state_stack[-1] + try: + return states[state][key] + except KeyError: + expected = states[state].keys() + + raise UnexpectedToken(token, expected, seq, i) + + def reduce(rule): + size = len(rule.expansion) + if size: + s = value_stack[-size:] + del state_stack[-size:] + del value_stack[-size:] + else: + s = [] + + value = self.callbacks[rule](s) + + _action, new_state = get_action(rule.origin) + assert _action is Shift + state_stack.append(new_state) + value_stack.append(value) + + # Main LALR-parser loop + try: + token = next(stream) + i += 1 + while True: + action, arg = get_action(token.type) + assert arg != self.end_state + + if action is Shift: + state_stack.append(arg) + value_stack.append(token) + if set_state: set_state(arg) + token = next(stream) + i += 1 + else: + reduce(arg) + except StopIteration: + pass + + while True: + _action, arg = get_action('$END') + if _action is Shift: + assert arg == self.end_state + val ,= value_stack + return val + else: + reduce(arg) + + + +class Rule(object): + """ + origin : a symbol + expansion : a list of symbols + """ + def __init__(self, origin, expansion, alias=None, options=None): + self.origin = origin + self.expansion = expansion + self.alias = alias + self.options = options + + def __str__(self): + return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) + + def __repr__(self): + return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) + + +class RuleOptions: + def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): + self.keep_all_tokens = keep_all_tokens + self.expand1 = expand1 + self.create_token = create_token # used for scanless postprocessing + self.priority = priority + + self.filter_out = filter_out # remove this rule from the tree + # used for "token"-rules in scanless + + def __repr__(self): + return 'RuleOptions(%r, %r, %r, %r, %r)' % ( + self.keep_all_tokens, + self.expand1, + self.create_token, + self.priority, + self.filter_out + ) + +Shift = 0 +Reduce = 1 +import re +MRES = ( +[('(?P(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P(?:[ \t\x0c' + '\r\n' + '])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])', + {1: 'SIGNED_NUMBER', + 2: 'ESCAPED_STRING', + 3: 'WS', + 4: '__FALSE1', + 5: '__NULL2', + 6: '__TRUE0', + 7: '__COLON', + 8: '__COMMA', + 9: '__LBRACE', + 10: '__LSQB', + 11: '__RBRACE', + 12: '__RSQB'})] +) +LEXER_CALLBACK = ( +{} +) +NEWLINE_TYPES = ['WS'] +IGNORE_TYPES = ['WS'] +class LexerRegexps: pass +lexer_regexps = LexerRegexps() +lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES] +lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres]) + for n, mres in LEXER_CALLBACK.items()} +lexer = _Lex(lexer_regexps) +def lex(stream): + return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES) +RULES = { + 0: Rule('start', ['value'], 
None, RuleOptions(False, True, None, None, False)), + 1: Rule('value', ['object'], None, RuleOptions(False, True, None, None, False)), + 2: Rule('value', ['array'], None, RuleOptions(False, True, None, None, False)), + 3: Rule('value', ['string'], None, RuleOptions(False, True, None, None, False)), + 4: Rule('value', ['SIGNED_NUMBER'], 'number', RuleOptions(False, True, None, None, False)), + 5: Rule('value', ['__TRUE0'], 'true', RuleOptions(False, True, None, None, False)), + 6: Rule('value', ['__FALSE1'], 'false', RuleOptions(False, True, None, None, False)), + 7: Rule('value', ['__NULL2'], 'null', RuleOptions(False, True, None, None, False)), + 8: Rule('array', ['__LSQB', 'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 9: Rule('array', ['__LSQB', 'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 10: Rule('array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)), + 11: Rule('object', ['__LBRACE', 'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 12: Rule('object', ['__LBRACE', 'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 13: Rule('object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)), + 14: Rule('pair', ['string', '__COLON', 'value'], None, RuleOptions(False, False, None, None, False)), + 15: Rule('string', ['ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)), + 16: Rule('__anon_star_0', ['__COMMA', 'value'], None, None), + 17: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', 'value'], None, None), + 18: Rule('__anon_star_1', ['__COMMA', 'pair'], None, None), + 19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', 'pair'], None, None), +} +parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree) +class ParseTable: pass +parse_table = ParseTable() +STATES = { + 0: {0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 4: (0, 5), 5: (0, 6), 6: (0, 7), 7: (0, 8), 8: (0, 9), 9: (0, 10), 10: (0, 11), 11: (0, 12)}, + 1: {12: (1, 5), 13: (1, 5), 14: (1, 5), 15: (1, 5)}, + 2: {9: (0, 10), 14: (0, 13), 16: (0, 14), 11: (0, 15)}, + 3: {12: (1, 2), 13: (1, 2), 14: (1, 2), 15: (1, 2)}, + 4: {12: (1, 1), 13: (1, 1), 14: (1, 1), 15: (1, 1)}, + 5: {12: (0, 16)}, + 6: {7: (0, 17), 0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 5: (0, 6), 6: (0, 7), 8: (0, 9), 9: (0, 10), 15: (0, 18), 10: (0, 11), 11: (0, 12)}, + 7: {12: (1, 4), 13: (1, 4), 14: (1, 4), 15: (1, 4)}, + 8: {12: (1, 0)}, + 9: {12: (1, 7), 13: (1, 7), 14: (1, 7), 15: (1, 7)}, + 10: {12: (1, 15), 17: (1, 15), 13: (1, 15), 14: (1, 15), 15: (1, 15)}, + 11: {12: (1, 6), 13: (1, 6), 14: (1, 6), 15: (1, 6)}, + 12: {12: (1, 3), 13: (1, 3), 14: (1, 3), 15: (1, 3)}, + 13: {13: (1, 13), 12: (1, 13), 14: (1, 13), 15: (1, 13)}, + 14: {14: (0, 19), 13: (0, 20), 18: (0, 21)}, + 15: {17: (0, 22)}, + 16: {}, + 17: {19: (0, 23), 15: (0, 24), 13: (0, 25)}, + 18: {13: (1, 10), 12: (1, 10), 14: (1, 10), 15: (1, 10)}, + 19: {13: (1, 12), 12: (1, 12), 14: (1, 12), 15: (1, 12)}, + 20: {9: (0, 10), 11: (0, 15), 16: (0, 26)}, + 21: {14: (0, 27), 13: (0, 28)}, + 22: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12), 7: (0, 29)}, + 23: {15: (0, 30), 13: (0, 31)}, + 24: {13: (1, 9), 12: (1, 9), 14: (1, 9), 15: (1, 9)}, + 25: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 7: (0, 32), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, + 26: {13: (1, 18), 14: (1, 18)}, + 27: {13: 
(1, 11), 12: (1, 11), 14: (1, 11), 15: (1, 11)}, + 28: {16: (0, 33), 9: (0, 10), 11: (0, 15)}, + 29: {13: (1, 14), 14: (1, 14)}, + 30: {13: (1, 8), 12: (1, 8), 14: (1, 8), 15: (1, 8)}, + 31: {5: (0, 6), 1: (0, 2), 0: (0, 1), 7: (0, 34), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)}, + 32: {15: (1, 16), 13: (1, 16)}, + 33: {13: (1, 19), 14: (1, 19)}, + 34: {15: (1, 17), 13: (1, 17)}, +} +TOKEN_TYPES = ( +{0: '__TRUE0', + 1: '__LBRACE', + 2: 'array', + 3: 'object', + 4: 'start', + 5: '__LSQB', + 6: 'SIGNED_NUMBER', + 7: 'value', + 8: '__NULL2', + 9: 'ESCAPED_STRING', + 10: '__FALSE1', + 11: 'string', + 12: '$END', + 13: '__COMMA', + 14: '__RBRACE', + 15: '__RSQB', + 16: 'pair', + 17: '__COLON', + 18: '__anon_star_1', + 19: '__anon_star_0'} +) +parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()} + for s, acts in STATES.items()} +parse_table.start_state = 0 +parse_table.end_state = 16 +class Lark_StandAlone: + def __init__(self, transformer=None, postlex=None): + callback = parse_tree_builder.create_callback(transformer=transformer) + callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()} + self.parser = _Parser(parse_table, callbacks) + self.postlex = postlex + def parse(self, stream): + tokens = lex(stream) + if self.postlex: tokens = self.postlex.process(tokens) + return self.parser.parse(tokens) diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py new file mode 100644 index 0000000..47c1bb1 --- /dev/null +++ b/examples/standalone/json_parser_main.py @@ -0,0 +1,25 @@ +import sys + +from json_parser import Lark_StandAlone, Transformer, inline_args + +class TreeToJson(Transformer): + @inline_args + def string(self, s): + return s[1:-1].replace('\\"', '"') + + array = list + pair = tuple + object = dict + number = inline_args(float) + + null = lambda self, _: None + true = lambda self, _: True + false = lambda self, _: False + + +parser = Lark_StandAlone(transformer=TreeToJson()) + +if __name__ == '__main__': + with open(sys.argv[1]) as f: + print(parser.parse(f.read())) + From 8acd77d7ffb546e2aa55c80042a8908d7e7e2fc9 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 16 Jan 2018 00:51:30 +0200 Subject: [PATCH 17/21] Minor fixes in lexer --- lark/grammars/common.g | 1 + lark/lexer.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lark/grammars/common.g b/lark/grammars/common.g index a54d49d..c38f485 100644 --- a/lark/grammars/common.g +++ b/lark/grammars/common.g @@ -12,6 +12,7 @@ DECIMAL: INT "." INT? | "." INT // float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/ _EXP: ("e"|"E") SIGNED_INT FLOAT: INT _EXP | DECIMAL _EXP? 
+SIGNED_FLOAT: ["+"|"-"] INT NUMBER: FLOAT | INT SIGNED_NUMBER: ["+"|"-"] NUMBER diff --git a/lark/lexer.py b/lark/lexer.py index 844025d..64cfb46 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -79,7 +79,7 @@ class _Lex: def lex(self, stream, newline_types, ignore_types): newline_types = list(newline_types) - newline_types = list(newline_types) + ignore_types = list(ignore_types) line_ctr = LineCounter() while True: @@ -93,7 +93,7 @@ class _Lex: t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) if t.type in lexer.callback: t = lexer.callback[t.type](t) - lexer = yield t + yield t line_ctr.feed(value, type_ in newline_types) break From 5fd331be542c586f96feacacc5163e75a533ac2e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 16 Jan 2018 00:52:31 +0200 Subject: [PATCH 18/21] BUGFIX: Internally repetitive rules are now handled silently (Issue #60) --- lark/load_grammar.py | 2 +- tests/test_parser.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2086591..16dc0d9 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -196,7 +196,7 @@ class SimplifyRule_Visitor(Visitor): tree.data = 'expansions' tree.children = [self.visit(T('expansion', [option if i==j else other for j, other in enumerate(tree.children)])) - for option in child.children] + for option in set(child.children)] break else: break diff --git a/tests/test_parser.py b/tests/test_parser.py index db28834..1c7cfcf 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -796,6 +796,16 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, ['a', 'A']) + def test_twice_empty(self): + g = """!start: [["A"]] + """ + l = _Lark(g) + tree = l.parse('A') + self.assertEqual(tree.children, ['A']) + + tree = l.parse('') + self.assertEqual(tree.children, []) + def test_undefined_ignore(self): g = """!start: "A" @@ -1016,6 +1026,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, []) + @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") def test_regex_escaping(self): g = _Lark("start: /[ab]/") From 37c1c0f65f40473007b0d32a941e684fb1119822 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 17 Jan 2018 00:05:24 +0200 Subject: [PATCH 19/21] Better error message for bad regexps (Issue #62) --- lark/common.py | 7 ++++--- lark/parser_frontends.py | 9 ++++++--- lark/utils.py | 7 +++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/lark/common.py b/lark/common.py index 1717fe7..f745018 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,7 +1,8 @@ import re -import sre_parse import sys +from .utils import get_regexp_width + Py36 = (sys.version_info[:2] >= (3, 6)) @@ -95,10 +96,10 @@ class PatternRE(Pattern): @property def min_width(self): - return sre_parse.parse(self.to_regexp()).getwidth()[0] + return get_regexp_width(self.to_regexp())[0] @property def max_width(self): - return sre_parse.parse(self.to_regexp()).getwidth()[1] + return get_regexp_width(self.to_regexp())[1] class TokenDef(object): def __init__(self, name, pattern, priority=1): diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e8e7ab8..db6cdcc 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,5 +1,5 @@ import re -import sre_parse +from .utils import get_regexp_width from .lexer import Lexer, ContextualLexer, Token @@ -77,7 +77,7 @@ class Earley_NoLex: self.regexps = {} for t in lexer_conf.tokens: regexp = t.pattern.to_regexp() - 
width = sre_parse.parse(regexp).getwidth() + width = get_regexp_width(regexp) if width != (1,1): raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) self.regexps[t.name] = re.compile(regexp) @@ -121,7 +121,10 @@ class XEarley: self.regexps = {} for t in lexer_conf.tokens: regexp = t.pattern.to_regexp() - assert sre_parse.parse(regexp).getwidth() + try: + assert get_regexp_width(regexp) + except ValueError: + raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp)) self.regexps[t.name] = re.compile(regexp) def parse(self, text): diff --git a/lark/utils.py b/lark/utils.py index 01c70a1..abe036f 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -112,3 +112,10 @@ except NameError: return -1 +import sre_parse +import sre_constants +def get_regexp_width(regexp): + try: + return sre_parse.parse(regexp).getwidth() + except sre_constants.error: + raise ValueError(regexp) From d173d6d66bc43b0000d7aa1e00b29426636e0b96 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 17 Jan 2018 10:38:51 +0200 Subject: [PATCH 20/21] Validate against zero-width terminals in XEarley (Issue #63) --- lark/parser_frontends.py | 6 +++++- tests/test_parser.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index db6cdcc..3865679 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -122,9 +122,13 @@ class XEarley: for t in lexer_conf.tokens: regexp = t.pattern.to_regexp() try: - assert get_regexp_width(regexp) + width = get_regexp_width(regexp)[0] except ValueError: raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp)) + else: + if width == 0: + raise ValueError("Dynamic Earley doesn't allow zero-width regexps") + self.regexps[t.name] = re.compile(regexp) def parse(self, text): diff --git a/tests/test_parser.py b/tests/test_parser.py index 1c7cfcf..8e954e2 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -159,7 +159,7 @@ def _make_full_earley_test(LEXER): # Fails an Earley implementation without special handling for empty rules, # or re-processing of already completed rules. g = Lark(r"""start: B - B: ("ab"|/[^b]/)* + B: ("ab"|/[^b]/)+ """, lexer=LEXER) self.assertEqual( g.parse('abc').children[0], 'abc') From b002ec47fb7879cafd1cf5abd56b4860241efe81 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 17 Jan 2018 10:49:52 +0200 Subject: [PATCH 21/21] BUGFIX: Repeating subrules are now allowed (Issue #61) --- lark/load_grammar.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 16dc0d9..2d01277 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -176,7 +176,6 @@ class SimplifyRule_Visitor(Visitor): break tree.expand_kids_by_index(*to_expand) - def expansion(self, tree): # rules_list unpacking # a : b (c|d) e @@ -210,7 +209,10 @@ class SimplifyRule_Visitor(Visitor): tree.data = 'expansions' tree.children = aliases - expansions = _flatten + def expansions(self, tree): + self._flatten(tree) + tree.children = list(set(tree.children)) + class RuleTreeToText(Transformer): def expansions(self, x):
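# Illustrative sketch (an assumed grammar, not the one from Issue #61): with
# two optional "b" items, expanding the optionals yields the alternative
# "a" "b" "c" twice.  Patches 18 and 21 make SimplifyRule_Visitor deduplicate
# such internally generated alternatives via set(), so a grammar of this shape
# should now build and parse cleanly instead of tripping over its own
# duplicated expansions.
from lark import Lark

grammar = r'''
start: "a" ["b"] ["b"] "c"
'''

parser = Lark(grammar, parser='lalr')
print(parser.parse('ac'))
print(parser.parse('abc'))    # the duplicated expansion is collapsed silently
print(parser.parse('abbc'))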
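# Illustrative sketch, not part of the patches themselves: the helper below
# repeats get_regexp_width() as added to lark/utils.py in patch 19, so its
# behaviour can be checked in isolation.  Patch 19 turns a malformed pattern
# into a ValueError that names the offending regexp (Issue #62), and patch 20
# reads the minimum width to reject zero-width terminals in the dynamic
# Earley front-end (Issue #63).
import sre_parse
import sre_constants

def get_regexp_width(regexp):
    try:
        return sre_parse.parse(regexp).getwidth()
    except sre_constants.error:
        raise ValueError(regexp)

print(get_regexp_width(r'[ab]'))     # (1, 1): fixed width, accepted by every lexer
print(get_regexp_width(r'a*')[0])    # 0: zero minimum width, rejected by XEarley
try:
    get_regexp_width(r'[')           # malformed pattern
except ValueError as e:
    print('Bad regexp:', e)          # the error now carries the pattern itself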