From 7c253b9372a0cb29f0dcaefc6e541c8308b6a532 Mon Sep 17 00:00:00 2001 From: Ehud Tamir Date: Tue, 9 Jan 2018 14:53:01 +0200 Subject: [PATCH 1/4] Merge CYK parser. --- LICENSE | 1 + lark/lark.py | 8 +- lark/parser_frontends.py | 55 +++++- lark/parsers/cyk.py | 401 +++++++++++++++++++++++++++++++++++++++ lark/parsers/earley.py | 3 +- lark/tree.py | 3 +- 6 files changed, 464 insertions(+), 7 deletions(-) create mode 100644 lark/parsers/cyk.py diff --git a/LICENSE b/LICENSE index efcb966..737149b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,5 @@ Copyright © 2017 Erez Shinan +Copyright (c) 2018 Google LLC Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/lark/lark.py b/lark/lark.py index d8ee186..8029638 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -59,7 +59,7 @@ class LarkOptions(object): self.propagate_positions = o.pop('propagate_positions', False) self.earley__predict_all = o.pop('earley__predict_all', False) - assert self.parser in ('earley', 'lalr', None) + assert self.parser in ('earley', 'lalr', 'cyk', None) if self.parser == 'earley' and self.transformer: raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.' @@ -131,6 +131,8 @@ class Lark: self.options.lexer = 'standard' elif self.options.parser == 'earley': self.options.lexer = 'dynamic' + elif self.options.parser == 'cyk': + self.options.lexer = 'standard' else: assert False, self.options.parser lexer = self.options.lexer @@ -140,7 +142,9 @@ class Lark: if self.options.parser == 'earley': self.options.ambiguity = 'resolve' else: - assert self.options.parser == 'earley', "Only Earley supports disambiguation right now" + disambig_parsers = ['earley', 'cyk'] + assert self.options.parser in disambig_parsers, ( + 'Only %s supports disambiguation right now') % ', '.join(disambig_parsers) assert self.options.ambiguity in ('resolve', 'explicit', 'auto', 'resolve__antiscore_sum') # Parse the grammar file and compose the grammars (TODO) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 718a0f9..37c6dd0 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,10 +1,12 @@ import re import sre_parse +from parsers.grammar_analysis import GrammarAnalyzer from .lexer import Lexer, ContextualLexer, Token -from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token -from .parsers import lalr_parser, earley, xearley, resolve_ambig +from .common import is_terminal, GrammarError, Terminal_Regexp, Terminal_Token +from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk +from .tree import Tree class WithLexer: def __init__(self, lexer_conf): @@ -142,6 +144,50 @@ class XEarley: def parse(self, text): return self.parser.parse(text) + +class CYK(WithLexer): + + def __init__(self, lexer_conf, parser_conf, options=None): + WithLexer.__init__(self, lexer_conf) + # TokenDef from synthetic rule to terminal value + self._token_by_name = {t.name: t for t in lexer_conf.tokens} + rules = [(lhs, self._prepare_expansion(rhs), cb, opt) for lhs, rhs, cb, opt in parser_conf.rules] + self._analysis = GrammarAnalyzer(rules, parser_conf.start) + self._parser = cyk.Parser(self._analysis.rules, parser_conf.start) + + self._postprocess = {} + for rule in self._analysis.rules: + if rule.origin != '$root': # XXX kinda ugly + a = rule.alias + self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a)) + + def 
_prepare_expansion(self, expansion):
+        return [
+            Terminal_Regexp(sym, self._token_by_name[sym].pattern.to_regexp())
+            if is_terminal(sym) else sym for sym in expansion
+        ]
+
+    def parse(self, text):
+        tokenized = [token.value for token in self.lex(text)]
+        parse = self._parser.parse(tokenized)
+        parse = self._transform(parse)
+        return parse
+
+    def _transform(self, tree):
+        subtrees = list(tree.iter_subtrees())
+        for subtree in subtrees:
+            subtree.children = [self._apply_callback(c) if isinstance(c, Tree) else c for c in subtree.children]
+
+        return self._apply_callback(tree)
+
+    def _apply_callback(self, tree):
+        children = tree.children
+        callback = self._postprocess[tree.rule.alias]
+        assert callback, tree.rule.alias
+        r = callback(children)
+        return r
+
+
 def get_frontend(parser, lexer):
     if parser=='lalr':
         if lexer is None:
@@ -163,6 +209,11 @@ def get_frontend(parser, lexer):
             raise ValueError('The Earley parser does not support the contextual parser')
         else:
             raise ValueError('Unknown lexer: %s' % lexer)
+    elif parser == 'cyk':
+        if lexer == 'standard':
+            return CYK
+        else:
+            raise ValueError('The CYK parser requires using the standard lexer.')
     else:
         raise ValueError('Unknown parser: %s' % parser)
diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py
new file mode 100644
index 0000000..08cb0fd
--- /dev/null
+++ b/lark/parsers/cyk.py
@@ -0,0 +1,401 @@
+"""This module implements a CYK parser."""
+from collections import defaultdict
+import itertools
+import re
+
+from ..common import ParseError, Terminal, Terminal_Regexp
+from ..lexer import Token
+from ..tree import Tree
+
+
+def TypeName(x):
+  return type(x).__name__
+
+
+class Symbol(object):
+  """Any grammar symbol."""
+
+  def __init__(self, s):
+    self.s = s
+
+  def __repr__(self):
+    return '%s(%s)' % (TypeName(self), str(self))
+
+  def __str__(self):
+    return str(self.s)
+
+  def __eq__(self, other):
+    return str(self) == str(other)
+
+  def __ne__(self, other):
+    return not self.__eq__(other)
+
+  def __hash__(self):
+    return hash(TypeName(self) + '&' + self.__str__())
+
+
+class T(Symbol):
+  """Terminal."""
+
+  def __init__(self, s):
+    super(T, self).__init__(s)
+    self.regexp = re.compile(s)
+
+  def match(self, s):
+    m = self.regexp.match(s)
+    return bool(m) and len(m.group(0)) == len(s)
+
+  def __eq__(self, other):
+    return super(T, self).__eq__(other) and isinstance(other, T)
+
+
+class NT(Symbol):
+  """Non-terminal."""
+
+  def __eq__(self, other):
+    return super(NT, self).__eq__(other) and isinstance(other, NT)
+
+
+class Rule(object):
+  """Context-free grammar rule."""
+
+  def __init__(self, lhs, rhs, weight, alias):
+    super(Rule, self).__init__()
+    assert isinstance(lhs, NT), lhs
+    assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs
+    self.lhs = lhs
+    self.rhs = rhs
+    self.weight = weight
+    self.alias = alias
+
+  def __str__(self):
+    return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs))
+
+  def __repr__(self):
+    return str(self)
+
+  def __hash__(self):
+    return hash(self.__repr__())
+
+  def __eq__(self, other):
+    return self.lhs == other.lhs and self.rhs == other.rhs
+
+  def __ne__(self, other):
+    return not self.__eq__(other)
+
+
+class Grammar(object):
+  """Context-free grammar."""
+
+  def __init__(self, rules):
+    super(Grammar, self).__init__()
+    self.rules = sorted(rules, key=lambda x: str(x))
+
+  def __eq__(self, other):
+    return set(self.rules) == set(other.rules)
+
+  def __str__(self):
+    return '\n' + '\n'.join(sorted(x.__repr__() for x in self.rules)) + '\n'
+
+  def __repr__(self):
+    return 
str(self) + + +# Parse tree data structures +class RuleNode(object): + """A node in the parse tree, which also contains the full rhs rule.""" + + def __init__(self, rule, children, weight=0): + super(RuleNode, self).__init__() + self.rule = rule + self.children = children + self.weight = weight + + def __repr__(self): + return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join( + str(x) for x in self.children)) + + def __hash__(self): + return hash(self.__repr__()) + + +class Node(object): + """A node in the parse tree.""" + + def __init__(self, lhs, children): + super(Node, self).__init__() + self.lhs = lhs + self.children = children + + def __repr__(self): + return 'Node(%s, [%s])' % (repr(self.lhs), ', '.join( + str(x) for x in self.children)) + + def __hash__(self): + return hash(self.__repr__()) + + +class Parser(object): + """Parser wrapper.""" + + def __init__(self, rules, start): + super(Parser, self).__init__() + self.orig_rules = {rule.alias: rule for rule in rules} + rules = [self._ToRule(rule) for rule in rules] + self.grammar = ToCnf(Grammar(rules)) + self.start = NT(start) + + def _ToRule(self, lark_rule): + """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" + return Rule( + NT(lark_rule.origin), [ + T(x.data) if (isinstance(x, Terminal_Regexp) or + isinstance(x, Terminal)) else NT(x) + for x in lark_rule.expansion + ], weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, alias=lark_rule.alias) + + def parse(self, tokenized): # pylint: disable=invalid-name + """Parses input, which is a list of tokens.""" + table, trees = _Parse(tokenized, self.grammar) + # Check if the parse succeeded. + if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]): + raise ParseError('Parsing failed.') + parse = trees[(0, len(tokenized) - 1)][NT(self.start)] + return self._ToTree(RevertCnf(parse)) + + def _ToTree(self, rule_node): + """Converts a RuleNode parse tree to a lark Tree.""" + orig_rule = self.orig_rules[rule_node.rule.alias] + children = [] + for i, child in enumerate(rule_node.children): + if isinstance(child, RuleNode): + children.append(self._ToTree(child)) + elif isinstance(child, Terminal_Regexp): + children.append(Token(orig_rule.expansion[i].name, child.s)) + else: + children.append(Token(orig_rule.expansion[i], child.s)) + return Tree(orig_rule.origin, children, rule=orig_rule) + + +def PrintParse(node, indent=0): + if isinstance(node, RuleNode): + print(' ' * (indent * 2) + str(node.rule.lhs)) + for child in node.children: + PrintParse(child, indent + 1) + else: + print(' ' * (indent * 2) + str(node.s)) + + +def _Parse(s, g): + """Parses sentence 's' using CNF grammar 'g'.""" + # The CYK table. Indexed with a 2-tuple: (start pos, end pos) + table = defaultdict(set) + # Top-level structure is similar to the CYK table. Each cell is a dict from + # rule name to the best (lightest) tree for that rule. 
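+  # For an input of length n, cell (i, j) covers tokens i..j inclusive, so the
+  # whole sentence lives in cell (0, n - 1): a parse exists iff some rule
+  # whose lhs is the start symbol ends up in table[(0, n - 1)].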
+  trees = defaultdict(dict)
+  # Populate base case with existing terminal production rules
+  for i, w in enumerate(s):
+    for terminal, rules in g.terminal_rules.iteritems():
+      if terminal.match(w):
+        for rule in rules:
+          table[(i, i)].add(rule)
+          if (rule.lhs not in trees[(i, i)] or
+              rule.weight < trees[(i, i)][rule.lhs].weight):
+            trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)
+  # Iterate over lengths of sub-sentences
+  for l in xrange(2, len(s) + 1):
+    # Iterate over sub-sentences with the given length
+    for i in xrange(len(s) - l + 1):
+      # Choose partition of the sub-sentence in [1, l)
+      for p in xrange(i + 1, i + l):
+        span1 = (i, p - 1)
+        span2 = (p, i + l - 1)
+        for r1, r2 in itertools.product(table[span1], table[span2]):
+          for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
+            table[(i, i + l - 1)].add(rule)
+            r1_tree = trees[span1][r1.lhs]
+            r2_tree = trees[span2][r2.lhs]
+            rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
+            if (rule.lhs not in trees[(i, i + l - 1)] or
+                rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
+              trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
+  return table, trees
+
+
+# This section implements a converter of context-free grammars to Chomsky normal form.
+# It also implements a conversion of parse trees from their CNF form back to the
+# original grammar.
+# Overview:
+# Applies the following operations in this order:
+# * TERM: Eliminates non-solitary terminals from all rules
+# * BIN: Eliminates rules with more than 2 symbols on their right-hand side.
+# * UNIT: Eliminates non-terminal unit rules
+#
+# The following grammar characteristics aren't supported:
+# * Start symbol appears on RHS
+# * Empty rules (epsilon rules)
+
+
+class CnfWrapper(object):
+  """CNF wrapper for grammar.
+
+  Validates that the input grammar is CNF and provides helper data structures.
+  """
+
+  def __init__(self, grammar):
+    super(CnfWrapper, self).__init__()
+    self.grammar = grammar
+    self.rules = grammar.rules
+    self.terminal_rules = defaultdict(list)
+    self.nonterminal_rules = defaultdict(list)
+    for r in self.rules:
+      # Validate that the grammar is CNF and populate auxiliary data structures.
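+      # CNF admits exactly two rule shapes: NT -> T (indexed by its terminal
+      # in terminal_rules) and NT -> NT NT (indexed by its rhs pair in
+      # nonterminal_rules); anything else fails the assertions below.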
+      assert isinstance(r.lhs, NT), r
+      assert len(r.rhs) in [1, 2], r
+      if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
+        self.terminal_rules[r.rhs[0]].append(r)
+      elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
+        self.nonterminal_rules[tuple(r.rhs)].append(r)
+      else:
+        assert False, r
+
+  def __eq__(self, other):
+    return self.grammar == other.grammar
+
+  def __repr__(self):
+    return self.grammar.__repr__()
+
+
+class UnitSkipRule(Rule):
+  """A rule that records NTs that were skipped during transformation."""
+
+  def __init__(self, lhs, rhs, skipped_rules, weight, alias):
+    super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
+    self.skipped_rules = skipped_rules
+
+  def __eq__(self, other):
+    return (super(UnitSkipRule, self).__eq__(other) and
+            isinstance(other, type(self)) and
+            self.skipped_rules == other.skipped_rules)
+
+
+def BuildUnitSkipRule(unit_rule, target_rule):
+  skipped_rules = []
+  if isinstance(unit_rule, UnitSkipRule):
+    skipped_rules += unit_rule.skipped_rules
+  skipped_rules.append(target_rule)
+  if isinstance(target_rule, UnitSkipRule):
+    skipped_rules += target_rule.skipped_rules
+  return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
+                      weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)
+
+
+def GetAnyNtUnitRule(g):
+  """Returns a non-terminal unit rule from 'g', or None if there is none."""
+  for rule in g.rules:
+    if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
+      return rule
+  return None
+
+
+def RemoveUnitRule(g, rule):
+  """Removes 'rule' from 'g' without changing the language produced by 'g'."""
+  new_rules = [x for x in g.rules if x != rule]
+  refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
+  for ref in refs:
+    new_rules.append(BuildUnitSkipRule(rule, ref))
+  return Grammar(new_rules)
+
+
+def Split(rule):
+  """Splits a rule whose len(rhs) > 2 into shorter rules."""
+  # if len(rule.rhs) <= 2:
+  #   return [rule]
+  rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
+  rule_name = '__SP_%s' % (rule_str) + '_%d'
+  new_rules = [Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)]
+  for i in xrange(1, len(rule.rhs) - 2):
+    new_rules.append(
+        Rule(NT(rule_name % i),
+             [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split'))
+  new_rules.append(Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split'))
+  return new_rules
+
+
+def Term(g):
+  """Applies the TERM rule on 'g' (see top comment)."""
+  all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
+  t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
+  new_rules = []
+  for rule in g.rules:
+    if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs):
+      new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs]
+      new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias))
+      new_rules.extend(v for k, v in t_rules.iteritems() if k in rule.rhs)
+    else:
+      new_rules.append(rule)
+  return Grammar(new_rules)
+
+
+def Bin(g):
+  """Applies the BIN rule to 'g' (see top comment)."""
+  new_rules = []
+  for rule in g.rules:
+    if len(rule.rhs) > 2:
+      new_rules.extend(Split(rule))
+    else:
+      new_rules.append(rule)
+  return Grammar(new_rules)
+
+
+def Unit(g):
+  """Applies the UNIT rule to 'g' (see top comment)."""
+  nt_unit_rule = GetAnyNtUnitRule(g)
+  while nt_unit_rule:
+    g = RemoveUnitRule(g, nt_unit_rule)
+    nt_unit_rule = GetAnyNtUnitRule(g)
+  return g
+
+
+def ToCnf(g):
+  """Creates a CNF grammar 
from a general context-free grammar 'g'.""" + g = Unit(Bin(Term(g))) + return CnfWrapper(g) + + +def UnrollUnitSkipRule(lhs, orig_rhs, skipped_rules, children, weight, alias): + if not skipped_rules: + return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight) + else: + weight = weight - skipped_rules[0].weight + return RuleNode( + Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [ + UnrollUnitSkipRule(skipped_rules[0].lhs, orig_rhs, + skipped_rules[1:], children, + skipped_rules[0].weight, skipped_rules[0].alias) + ], weight=weight) + + +def RevertCnf(node): + """Reverts a parse tree (RuleNode) to its original non-CNF form (Node).""" + if isinstance(node, T): + return node + # Reverts TERM rule. + if node.rule.lhs.s.startswith('__T_'): + return node.children[0] + else: + children = [] + reverted_children = [RevertCnf(x) for x in node.children] + for child in reverted_children: + # Reverts BIN rule. + if isinstance(child, RuleNode) and child.rule.lhs.s.startswith('__SP_'): + children.extend(child.children) + else: + children.append(child) + # Reverts UNIT rule. + if isinstance(node.rule, UnitSkipRule): + return UnrollUnitSkipRule(node.rule.lhs, node.rule.rhs, + node.rule.skipped_rules, children, + node.rule.weight, node.rule.alias) + else: + return RuleNode(node.rule, children) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 55893f5..dbe6834 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -25,8 +25,7 @@ class Derivation(Tree): _hash = None def __init__(self, rule, items=None): - Tree.__init__(self, 'drv', items or []) - self.rule = rule + Tree.__init__(self, 'drv', items or [], rule=rule) def _pretty_label(self): # Nicer pretty for debugging the parser return self.rule.origin if self.rule else self.data diff --git a/lark/tree.py b/lark/tree.py index f832857..7251ce6 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -8,9 +8,10 @@ from copy import deepcopy from .utils import inline_args class Tree(object): - def __init__(self, data, children): + def __init__(self, data, children, rule=None): self.data = data self.children = list(children) + self.rule = rule def __repr__(self): return 'Tree(%s, %s)' % (self.data, self.children) From 748e9b7248e788f8ff0e8ab767379158f922ad9d Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 23 Jan 2018 10:19:21 +0200 Subject: [PATCH 2/4] All relevant tests passing. Also indentation and other refactoring. 
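
A quick usage sketch of the CYK backend this series adds (illustrative only;
the grammar below is an arbitrary example, not part of the patch):

    from lark import Lark

    # parser='cyk' implies the standard lexer (see lark.py in patch 1)
    parser = Lark('start: "a" "b" "c"', parser='cyk')
    tree = parser.parse('abc')

Ambiguity is resolved by keeping, per span and nonterminal, the lightest
derivation, with rule priorities acting as weights.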
--- lark/parser_frontends.py | 42 +-- lark/parsers/cyk.py | 547 ++++++++++++++++++--------------------- tests/__main__.py | 1 + tests/test_parser.py | 12 +- 4 files changed, 292 insertions(+), 310 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index bc87921..b4a9a89 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -1,10 +1,11 @@ import re from .utils import get_regexp_width -from parsers.grammar_analysis import GrammarAnalyzer +from .parsers.grammar_analysis import GrammarAnalyzer from .lexer import Lexer, ContextualLexer, Token -from .common import is_terminal, GrammarError, Terminal_Regexp, Terminal_Token +from .common import GrammarError +from .common import is_terminal, GrammarError from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk from .tree import Tree @@ -137,31 +138,36 @@ class XEarley: return self.parser.parse(text) +class Earley(WithLexer): + def __init__(self, lexer_conf, parser_conf, options=None): + self.init_traditional_lexer(lexer_conf) + + self.parser = earley.Parser(parser_conf, self.match, + resolve_ambiguity=get_ambiguity_resolver(options)) + + def match(self, term, token): + return term == token.type + + def parse(self, text): + tokens = self.lex(text) + return self.parser.parse(tokens) + class CYK(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): - WithLexer.__init__(self, lexer_conf) - # TokenDef from synthetic rule to terminal value - self._token_by_name = {t.name: t for t in lexer_conf.tokens} - rules = [(lhs, self._prepare_expansion(rhs), cb, opt) for lhs, rhs, cb, opt in parser_conf.rules] - self._analysis = GrammarAnalyzer(rules, parser_conf.start) + self.init_traditional_lexer(lexer_conf) + + self._analysis = GrammarAnalyzer(parser_conf) self._parser = cyk.Parser(self._analysis.rules, parser_conf.start) self._postprocess = {} for rule in self._analysis.rules: - if rule.origin != '$root': # XXX kinda ugly - a = rule.alias - self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a)) - - def _prepare_expansion(self, expansion): - return [ - Terminal_Regexp(sym, self._token_by_name[sym].pattern.to_regexp()) - if is_terminal(sym) else sym for sym in expansion - ] + a = rule.alias + self._postprocess[a] = a if callable(a) else (a and getattr(parser_conf.callback, a)) def parse(self, text): - tokenized = [token.value for token in self.lex(text)] - parse = self._parser.parse(tokenized) + tokens = list(self.lex(text)) + parse = self._parser.parse(tokens) parse = self._transform(parse) return parse diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py index 08cb0fd..e23792c 100644 --- a/lark/parsers/cyk.py +++ b/lark/parsers/cyk.py @@ -1,225 +1,193 @@ """This module implements a CYK parser.""" + from collections import defaultdict import itertools -import re -from ..common import ParseError, Terminal, Terminal_Regexp +from ..common import ParseError, is_terminal from ..lexer import Token from ..tree import Tree - -def TypeName(x): - return type(x).__name__ - +try: + xrange +except NameError: + xrange = range class Symbol(object): - """Any grammar symbol.""" + """Any grammar symbol.""" - def __init__(self, s): - self.s = s + def __init__(self, s): + self.s = s - def __repr__(self): - return '%s(%s)' % (TypeName(self), str(self)) + def __repr__(self): + return '%s(%s)' % (type(self).__name__, str(self)) - def __str__(self): - return str(self.s) + def __str__(self): + return str(self.s) - def __eq__(self, other): - return str(self) == str(other) + def 
__eq__(self, other): + return self.s == str(other) - def __ne__(self, other): - return not self.__eq__(other) + def __ne__(self, other): + return not self.__eq__(other) - def __hash__(self): - return hash(TypeName(self) + '&' + self.__str__()) + def __hash__(self): + return hash((type(self), str(self.s))) class T(Symbol): - """Terminal.""" - - def __init__(self, s): - super(T, self).__init__(s) - self.regexp = re.compile(s) + """Terminal.""" - def match(self, s): - m = self.regexp.match(s) - return bool(m) and len(m.group(0)) == len(s) - - def __eq__(self, other): - return super(T, self).__eq__(other) and isinstance(other, T) + def match(self, s): + return self.s == s.type class NT(Symbol): - """Non-terminal.""" - - def __eq__(self, other): - return super(NT, self).__eq__(other) and isinstance(other, NT) + """Non-terminal.""" + pass class Rule(object): - """Context-free grammar rule.""" + """Context-free grammar rule.""" - def __init__(self, lhs, rhs, weight, alias): - super(Rule, self).__init__() - assert isinstance(lhs, NT), lhs - assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs - self.lhs = lhs - self.rhs = rhs - self.weight = weight - self.alias = alias + def __init__(self, lhs, rhs, weight, alias): + super(Rule, self).__init__() + assert isinstance(lhs, NT), lhs + assert all(isinstance(x, NT) or isinstance(x, T) for x in rhs), rhs + self.lhs = lhs + self.rhs = rhs + self.weight = weight + self.alias = alias - def __str__(self): - return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs)) + def __str__(self): + return '%s -> %s' % (str(self.lhs), ' '.join(str(x) for x in self.rhs)) - def __repr__(self): - return str(self) + def __repr__(self): + return str(self) - def __hash__(self): - return hash(self.__repr__()) + def __hash__(self): + return hash((self.lhs, tuple(self.rhs))) - def __eq__(self, other): - return self.lhs == other.lhs and self.rhs == other.rhs + def __eq__(self, other): + return self.lhs == other.lhs and self.rhs == other.rhs - def __ne__(self, other): - return not self.__eq__(other) + def __ne__(self, other): + return not (self == other) class Grammar(object): - """Context-free grammar.""" + """Context-free grammar.""" - def __init__(self, rules): - super(Grammar, self).__init__() - self.rules = sorted(rules, key=lambda x: str(x)) + def __init__(self, rules): + super(Grammar, self).__init__() + self.rules = rules - def __eq__(self, other): - return set(self.rules) == set(other.rules) + def __eq__(self, other): + return set(self.rules) == set(other.rules) - def __str__(self): - return '\n' + '\n'.join(sorted(x.__repr__() for x in self.rules)) + '\n' + def __str__(self): + return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n' - def __repr__(self): - return str(self) + def __repr__(self): + return str(self) # Parse tree data structures class RuleNode(object): - """A node in the parse tree, which also contains the full rhs rule.""" - - def __init__(self, rule, children, weight=0): - super(RuleNode, self).__init__() - self.rule = rule - self.children = children - self.weight = weight - - def __repr__(self): - return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', '.join( - str(x) for x in self.children)) + """A node in the parse tree, which also contains the full rhs rule.""" - def __hash__(self): - return hash(self.__repr__()) + def __init__(self, rule, children, weight=0): + self.rule = rule + self.children = children + self.weight = weight + def __repr__(self): + return 'RuleNode(%s, [%s])' % (repr(self.rule.lhs), ', 
'.join(str(x) for x in self.children)) -class Node(object): - """A node in the parse tree.""" - - def __init__(self, lhs, children): - super(Node, self).__init__() - self.lhs = lhs - self.children = children - - def __repr__(self): - return 'Node(%s, [%s])' % (repr(self.lhs), ', '.join( - str(x) for x in self.children)) - - def __hash__(self): - return hash(self.__repr__()) class Parser(object): - """Parser wrapper.""" - - def __init__(self, rules, start): - super(Parser, self).__init__() - self.orig_rules = {rule.alias: rule for rule in rules} - rules = [self._ToRule(rule) for rule in rules] - self.grammar = ToCnf(Grammar(rules)) - self.start = NT(start) - - def _ToRule(self, lark_rule): - """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" - return Rule( - NT(lark_rule.origin), [ - T(x.data) if (isinstance(x, Terminal_Regexp) or - isinstance(x, Terminal)) else NT(x) - for x in lark_rule.expansion - ], weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, alias=lark_rule.alias) - - def parse(self, tokenized): # pylint: disable=invalid-name - """Parses input, which is a list of tokens.""" - table, trees = _Parse(tokenized, self.grammar) - # Check if the parse succeeded. - if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]): - raise ParseError('Parsing failed.') - parse = trees[(0, len(tokenized) - 1)][NT(self.start)] - return self._ToTree(RevertCnf(parse)) - - def _ToTree(self, rule_node): - """Converts a RuleNode parse tree to a lark Tree.""" - orig_rule = self.orig_rules[rule_node.rule.alias] - children = [] - for i, child in enumerate(rule_node.children): - if isinstance(child, RuleNode): - children.append(self._ToTree(child)) - elif isinstance(child, Terminal_Regexp): - children.append(Token(orig_rule.expansion[i].name, child.s)) - else: - children.append(Token(orig_rule.expansion[i], child.s)) - return Tree(orig_rule.origin, children, rule=orig_rule) + """Parser wrapper.""" + + def __init__(self, rules, start): + super(Parser, self).__init__() + self.orig_rules = {rule.alias: rule for rule in rules} + rules = [self._ToRule(rule) for rule in rules] + self.grammar = ToCnf(Grammar(rules)) + self.start = NT(start) + + def _ToRule(self, lark_rule): + """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" + return Rule( + NT(lark_rule.origin), [ + T(x) if is_terminal(x) else NT(x) for x in lark_rule.expansion + ], weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, alias=lark_rule.alias) + + def parse(self, tokenized): # pylint: disable=invalid-name + """Parses input, which is a list of tokens.""" + table, trees = _parse(tokenized, self.grammar) + # Check if the parse succeeded. 
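+        # 'table' only answers recognition; 'trees' keeps the lightest
+        # derivation per (span, lhs), which is then converted back from CNF.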
+ if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]): + raise ParseError('Parsing failed.') + parse = trees[(0, len(tokenized) - 1)][NT(self.start)] + return self._ToTree(RevertCnf(parse)) + + def _ToTree(self, rule_node): + """Converts a RuleNode parse tree to a lark Tree.""" + orig_rule = self.orig_rules[rule_node.rule.alias] + children = [] + for i, child in enumerate(rule_node.children): + if isinstance(child, RuleNode): + children.append(self._ToTree(child)) + else: + assert isinstance(child.s, Token) + children.append(child.s) + return Tree(orig_rule.origin, children, rule=orig_rule) def PrintParse(node, indent=0): - if isinstance(node, RuleNode): - print(' ' * (indent * 2) + str(node.rule.lhs)) - for child in node.children: - PrintParse(child, indent + 1) - else: - print(' ' * (indent * 2) + str(node.s)) - - -def _Parse(s, g): - """Parses sentence 's' using CNF grammar 'g'.""" - # The CYK table. Indexed with a 2-tuple: (start pos, end pos) - table = defaultdict(set) - # Top-level structure is similar to the CYK table. Each cell is a dict from - # rule name to the best (lightest) tree for that rule. - trees = defaultdict(dict) - # Populate base case with existing terminal production rules - for i, w in enumerate(s): - for terminal, rules in g.terminal_rules.iteritems(): - if terminal.match(w): - for rule in rules: - table[(i, i)].add(rule) - if (rule.lhs not in trees[(i, i)] or - rule.weight < trees[(i, i)][rule.lhs].weight): - trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight) - # Iterate over lengths of sub-sentences - for l in xrange(2, len(s) + 1): - # Iterate over sub-sentences with the given length - for i in xrange(len(s) - l + 1): - # Choose partition of the sub-sentence in [1, l) - for p in xrange(i + 1, i + l): - span1 = (i, p - 1) - span2 = (p, i + l - 1) - for r1, r2 in itertools.product(table[span1], table[span2]): - for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []): - table[(i, i + l - 1)].add(rule) - r1_tree = trees[span1][r1.lhs] - r2_tree = trees[span2][r2.lhs] - rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight - if (rule.lhs not in trees[(i, i + l - 1)] or - rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight): - trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight) - return table, trees + if isinstance(node, RuleNode): + print(' ' * (indent * 2) + str(node.rule.lhs)) + for child in node.children: + PrintParse(child, indent + 1) + else: + print(' ' * (indent * 2) + str(node.s)) + + +def _parse(s, g): + """Parses sentence 's' using CNF grammar 'g'.""" + # The CYK table. Indexed with a 2-tuple: (start pos, end pos) + table = defaultdict(set) + # Top-level structure is similar to the CYK table. Each cell is a dict from + # rule name to the best (lightest) tree for that rule. 
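+    # A cell is only ever updated when a strictly lighter derivation of the
+    # same lhs is found, which is what resolves ambiguity deterministically.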
+    trees = defaultdict(dict)
+    # Populate base case with existing terminal production rules
+    for i, w in enumerate(s):
+        for terminal, rules in g.terminal_rules.items():
+            if terminal.match(w):
+                for rule in rules:
+                    table[(i, i)].add(rule)
+                    if (rule.lhs not in trees[(i, i)] or
+                            rule.weight < trees[(i, i)][rule.lhs].weight):
+                        trees[(i, i)][rule.lhs] = RuleNode(rule, [T(w)], weight=rule.weight)
+
+    # Iterate over lengths of sub-sentences
+    for l in xrange(2, len(s) + 1):
+        # Iterate over sub-sentences with the given length
+        for i in xrange(len(s) - l + 1):
+            # Choose partition of the sub-sentence in [1, l)
+            for p in xrange(i + 1, i + l):
+                span1 = (i, p - 1)
+                span2 = (p, i + l - 1)
+                for r1, r2 in itertools.product(table[span1], table[span2]):
+                    for rule in g.nonterminal_rules.get((r1.lhs, r2.lhs), []):
+                        table[(i, i + l - 1)].add(rule)
+                        r1_tree = trees[span1][r1.lhs]
+                        r2_tree = trees[span2][r2.lhs]
+                        rule_total_weight = rule.weight + r1_tree.weight + r2_tree.weight
+                        if (rule.lhs not in trees[(i, i + l - 1)]
+                            or rule_total_weight < trees[(i, i + l - 1)][rule.lhs].weight):
+                            trees[(i, i + l - 1)][rule.lhs] = RuleNode(rule, [r1_tree, r2_tree], weight=rule_total_weight)
+    return table, trees


 # This section implements a converter of context-free grammars to Chomsky normal form.
@@ -237,7 +205,162 @@ def _Parse(s, g):


 class CnfWrapper(object):
-  """CNF wrapper for grammar.
+    """CNF wrapper for grammar.

   Validates that the input grammar is CNF and provides helper data structures.
   """

-  def __init__(self, grammar):
-    super(CnfWrapper, self).__init__()
-    self.grammar = grammar
-    self.rules = grammar.rules
-    self.terminal_rules = defaultdict(list)
-    self.nonterminal_rules = defaultdict(list)
-    for r in self.rules:
-      # Validate that the grammar is CNF and populate auxiliary data structures.
+    def __init__(self, grammar):
+        super(CnfWrapper, self).__init__()
+        self.grammar = grammar
+        self.rules = grammar.rules
+        self.terminal_rules = defaultdict(list)
+        self.nonterminal_rules = defaultdict(list)
+        for r in self.rules:
+            # Validate that the grammar is CNF and populate auxiliary data structures.
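+            # These two indices give _parse its O(1) lookups: terminal_rules
+            # seeds the table's diagonal, nonterminal_rules drives the
+            # combination of adjacent spans.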
+            assert isinstance(r.lhs, NT), r
+            assert len(r.rhs) in [1, 2], r
+            if len(r.rhs) == 1 and isinstance(r.rhs[0], T):
+                self.terminal_rules[r.rhs[0]].append(r)
+            elif len(r.rhs) == 2 and all(isinstance(x, NT) for x in r.rhs):
+                self.nonterminal_rules[tuple(r.rhs)].append(r)
+            else:
+                assert False, r

+    def __eq__(self, other):
+        return self.grammar == other.grammar

+    def __repr__(self):
+        return repr(self.grammar)


 class UnitSkipRule(Rule):
-  """A rule that records NTs that were skipped during transformation."""
+    """A rule that records NTs that were skipped during transformation."""

-  def __init__(self, lhs, rhs, skipped_rules, weight, alias):
-    super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
-    self.skipped_rules = skipped_rules
+    def __init__(self, lhs, rhs, skipped_rules, weight, alias):
+        super(UnitSkipRule, self).__init__(lhs, rhs, weight, alias)
+        self.skipped_rules = skipped_rules

-  def __eq__(self, other):
-    return (super(UnitSkipRule, self).__eq__(other) and
-            isinstance(other, type(self)) and
-            self.skipped_rules == other.skipped_rules)
+    def __eq__(self, other):
+        return isinstance(other, type(self)) and self.skipped_rules == other.skipped_rules
+
+    __hash__ = Rule.__hash__


 def BuildUnitSkipRule(unit_rule, target_rule):
-  skipped_rules = []
-  if isinstance(unit_rule, UnitSkipRule):
-    skipped_rules += unit_rule.skipped_rules
-  skipped_rules.append(target_rule)
-  if isinstance(target_rule, UnitSkipRule):
-    skipped_rules += target_rule.skipped_rules
-  return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
+    skipped_rules = []
+    if isinstance(unit_rule, UnitSkipRule):
+        skipped_rules += unit_rule.skipped_rules
+    skipped_rules.append(target_rule)
+    if isinstance(target_rule, UnitSkipRule):
+        skipped_rules += target_rule.skipped_rules
+    return UnitSkipRule(unit_rule.lhs, target_rule.rhs, skipped_rules,
                         weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)


 def GetAnyNtUnitRule(g):
-  """Returns a non-terminal unit rule from 'g', or None if there is none."""
-  for rule in g.rules:
-    if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
-      return rule
-  return None
+    """Returns a non-terminal unit rule from 'g', or None if there is none."""
+    for rule in g.rules:
+        if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
+            return rule
+    return None


 def RemoveUnitRule(g, rule):
-  """Removes 'rule' from 'g' without changing the language produced by 'g'."""
-  new_rules = [x for x in g.rules if x != rule]
-  refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
-  for ref in refs:
-    new_rules.append(BuildUnitSkipRule(rule, ref))
-  return Grammar(new_rules)
+    """Removes 'rule' from 'g' without changing the language produced by 'g'."""
+    new_rules = [x for x in g.rules if x != rule]
+    refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
+    for ref in refs:
+        new_rules.append(BuildUnitSkipRule(rule, ref))
+    return Grammar(new_rules)


 def Split(rule):
-  """Splits a rule whose len(rhs) > 2 into shorter rules."""
-  # if len(rule.rhs) <= 2:
-  #   return [rule]
-  rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
-  rule_name = '__SP_%s' % (rule_str) + '_%d'
-  new_rules = [Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)]
-  for i in xrange(1, len(rule.rhs) - 2):
-    new_rules.append(
-        Rule(NT(rule_name % i),
-             [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split'))
-  new_rules.append(Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split'))
-  return new_rules
+    """Splits a rule 
whose len(rhs) > 2 into shorter rules.""" + rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs) + rule_name = '__SP_%s' % (rule_str) + '_%d' + new_rules = [Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)] + for i in xrange(1, len(rule.rhs) - 2): + new_rules.append( Rule(NT(rule_name % i), + [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split')) + new_rules.append(Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split')) + return new_rules def Term(g): - """Applies the TERM rule on 'g' (see top comment).""" - all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)} - t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t} - new_rules = [] - for rule in g.rules: - if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs): - new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs] - new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias)) - new_rules.extend(v for k, v in t_rules.iteritems() if k in rule.rhs) - else: - new_rules.append(rule) - return Grammar(new_rules) + """Applies the TERM rule on 'g' (see top comment).""" + all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)} + t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t} + new_rules = [] + for rule in g.rules: + if len(rule.rhs) > 1 and any(isinstance(x, T) for x in rule.rhs): + new_rhs = [t_rules[x].lhs if isinstance(x, T) else x for x in rule.rhs] + new_rules.append(Rule(rule.lhs, new_rhs, weight=rule.weight, alias=rule.alias)) + new_rules.extend(v for k, v in t_rules.items() if k in rule.rhs) + else: + new_rules.append(rule) + return Grammar(new_rules) def Bin(g): - """Applies the BIN rule to 'g' (see top comment).""" - new_rules = [] - for rule in g.rules: - if len(rule.rhs) > 2: - new_rules.extend(Split(rule)) - else: - new_rules.append(rule) - return Grammar(new_rules) + """Applies the BIN rule to 'g' (see top comment).""" + new_rules = [] + for rule in g.rules: + if len(rule.rhs) > 2: + new_rules.extend(Split(rule)) + else: + new_rules.append(rule) + return Grammar(new_rules) def Unit(g): - """Applies the UNIT rule to 'g' (see top comment).""" - nt_unit_rule = GetAnyNtUnitRule(g) - while nt_unit_rule: - g = RemoveUnitRule(g, nt_unit_rule) + """Applies the UNIT rule to 'g' (see top comment).""" nt_unit_rule = GetAnyNtUnitRule(g) - return g + while nt_unit_rule: + g = RemoveUnitRule(g, nt_unit_rule) + nt_unit_rule = GetAnyNtUnitRule(g) + return g def ToCnf(g): - """Creates a CNF grammar from a general context-free grammar 'g'.""" - g = Unit(Bin(Term(g))) - return CnfWrapper(g) + """Creates a CNF grammar from a general context-free grammar 'g'.""" + g = Unit(Bin(Term(g))) + return CnfWrapper(g) def UnrollUnitSkipRule(lhs, orig_rhs, skipped_rules, children, weight, alias): - if not skipped_rules: - return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight) - else: - weight = weight - skipped_rules[0].weight - return RuleNode( - Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [ - UnrollUnitSkipRule(skipped_rules[0].lhs, orig_rhs, - skipped_rules[1:], children, - skipped_rules[0].weight, skipped_rules[0].alias) - ], weight=weight) + if not skipped_rules: + return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight) + else: + weight = weight - skipped_rules[0].weight + return RuleNode( + Rule(lhs, [skipped_rules[0].lhs], 
weight=weight, alias=alias), [ + UnrollUnitSkipRule(skipped_rules[0].lhs, orig_rhs, + skipped_rules[1:], children, + skipped_rules[0].weight, skipped_rules[0].alias) + ], weight=weight) def RevertCnf(node): - """Reverts a parse tree (RuleNode) to its original non-CNF form (Node).""" - if isinstance(node, T): - return node - # Reverts TERM rule. - if node.rule.lhs.s.startswith('__T_'): - return node.children[0] - else: - children = [] - reverted_children = [RevertCnf(x) for x in node.children] - for child in reverted_children: - # Reverts BIN rule. - if isinstance(child, RuleNode) and child.rule.lhs.s.startswith('__SP_'): - children.extend(child.children) - else: - children.append(child) - # Reverts UNIT rule. - if isinstance(node.rule, UnitSkipRule): - return UnrollUnitSkipRule(node.rule.lhs, node.rule.rhs, - node.rule.skipped_rules, children, - node.rule.weight, node.rule.alias) + """Reverts a parse tree (RuleNode) to its original non-CNF form (Node).""" + if isinstance(node, T): + return node + # Reverts TERM rule. + if node.rule.lhs.s.startswith('__T_'): + return node.children[0] else: - return RuleNode(node.rule, children) + children = [] + reverted_children = [RevertCnf(x) for x in node.children] + for child in reverted_children: + # Reverts BIN rule. + if isinstance(child, RuleNode) and child.rule.lhs.s.startswith('__SP_'): + children.extend(child.children) + else: + children.append(child) + # Reverts UNIT rule. + if isinstance(node.rule, UnitSkipRule): + return UnrollUnitSkipRule(node.rule.lhs, node.rule.rhs, + node.rule.skipped_rules, children, + node.rule.weight, node.rule.alias) + else: + return RuleNode(node.rule, children) diff --git a/tests/__main__.py b/tests/__main__.py index 4ba32f7..4f7fdf7 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -16,6 +16,7 @@ except ImportError: from .test_parser import ( TestLalrStandard, TestEarleyStandard, + TestCykStandard, TestLalrContextual, TestEarleyScanless, TestEarleyDynamic, diff --git a/tests/test_parser.py b/tests/test_parser.py index 8e954e2..38ada24 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -382,6 +382,7 @@ def _make_parser_test(LEXER, PARSER): g.parse(u'\xa3\u0101\u00a3\u0203\n') + @unittest.skipIf(PARSER == 'cyk', "Takes forever") def test_stack_for_ebnf(self): """Verify that stack depth isn't an issue for EBNF grammars""" g = _Lark(r"""start: a+ @@ -455,6 +456,7 @@ def _make_parser_test(LEXER, PARSER): + @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_empty_expand1_list(self): g = _Lark(r"""start: list ?list: item* @@ -473,6 +475,7 @@ def _make_parser_test(LEXER, PARSER): [list] = r.children self.assertSequenceEqual([item.data for item in list.children], ()) + @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_empty_expand1_list_2(self): g = _Lark(r"""start: list ?list: item* "!"? 
@@ -492,6 +495,7 @@ def _make_parser_test(LEXER, PARSER): self.assertSequenceEqual([item.data for item in list.children], ()) + @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_empty_flatten_list(self): g = _Lark(r"""start: list list: | item "," list @@ -645,6 +649,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(len(x.children), 1, '/a/ should not be considered anonymous') self.assertEqual(x.children[0].type, "A") + @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_maybe(self): g = _Lark("""start: ["a"] """) x = g.parse('a') @@ -702,6 +707,7 @@ def _make_parser_test(LEXER, PARSER): # B: A # """) + @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_empty(self): # Fails an Earley implementation without special handling for empty rules, # or re-processing of already completed rules. @@ -732,6 +738,8 @@ def _make_parser_test(LEXER, PARSER): def test_float_without_lexer(self): expected_error = UnexpectedInput if LEXER == 'dynamic' else UnexpectedToken + if PARSER == 'cyk': + expected_error = ParseError g = _Lark("""start: ["+"|"-"] float float: digit* "." digit+ exp? @@ -796,6 +804,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, ['a', 'A']) + @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_twice_empty(self): g = """!start: [["A"]] """ @@ -1001,6 +1010,7 @@ def _make_parser_test(LEXER, PARSER): @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") + @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_ignore(self): grammar = r""" COMMENT: /(!|(\/\/))[^\n]*/ @@ -1026,7 +1036,6 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(tree.children, []) - @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") def test_regex_escaping(self): g = _Lark("start: /[ab]/") @@ -1075,6 +1084,7 @@ def _make_parser_test(LEXER, PARSER): # Note: You still have to import them in __main__ for the tests to run _TO_TEST = [ ('standard', 'earley'), + ('standard', 'cyk'), ('dynamic', 'earley'), ('standard', 'lalr'), ('contextual', 'lalr'), From 648099d7b429ae5d5b2eff99cf018d49d4e59334 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 23 Jan 2018 14:44:45 +0200 Subject: [PATCH 3/4] Idiomatic function names and a few other style fixes --- lark/parsers/cyk.py | 71 +++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py index e23792c..fbd52a9 100644 --- a/lark/parsers/cyk.py +++ b/lark/parsers/cyk.py @@ -78,11 +78,10 @@ class Grammar(object): """Context-free grammar.""" def __init__(self, rules): - super(Grammar, self).__init__() - self.rules = rules + self.rules = frozenset(rules) def __eq__(self, other): - return set(self.rules) == set(other.rules) + return self.rules == other.rules def __str__(self): return '\n' + '\n'.join(sorted(repr(x) for x in self.rules)) + '\n' @@ -111,11 +110,11 @@ class Parser(object): def __init__(self, rules, start): super(Parser, self).__init__() self.orig_rules = {rule.alias: rule for rule in rules} - rules = [self._ToRule(rule) for rule in rules] - self.grammar = ToCnf(Grammar(rules)) + rules = [self._to_rule(rule) for rule in rules] + self.grammar = to_cnf(Grammar(rules)) self.start = NT(start) - def _ToRule(self, lark_rule): + def _to_rule(self, lark_rule): """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" return Rule( NT(lark_rule.origin), [ @@ -129,26 +128,26 @@ class Parser(object): if all(r.lhs != self.start for r in table[(0, 
len(tokenized) - 1)]):
             raise ParseError('Parsing failed.')
         parse = trees[(0, len(tokenized) - 1)][NT(self.start)]
-        return self._ToTree(RevertCnf(parse))
+        return self._to_tree(revert_cnf(parse))

-    def _ToTree(self, rule_node):
+    def _to_tree(self, rule_node):
         """Converts a RuleNode parse tree to a lark Tree."""
         orig_rule = self.orig_rules[rule_node.rule.alias]
         children = []
         for i, child in enumerate(rule_node.children):
             if isinstance(child, RuleNode):
-                children.append(self._ToTree(child))
+                children.append(self._to_tree(child))
             else:
                 assert isinstance(child.s, Token)
                 children.append(child.s)
         return Tree(orig_rule.origin, children, rule=orig_rule)


-def PrintParse(node, indent=0):
+def print_parse(node, indent=0):
     if isinstance(node, RuleNode):
         print(' ' * (indent * 2) + str(node.rule.lhs))
         for child in node.children:
-            PrintParse(child, indent + 1)
+            print_parse(child, indent + 1)
     else:
         print(' ' * (indent * 2) + str(node.s))

@@ -247,7 +246,7 @@ class UnitSkipRule(Rule):
     __hash__ = Rule.__hash__


-def BuildUnitSkipRule(unit_rule, target_rule):
+def build_unit_skiprule(unit_rule, target_rule):
     skipped_rules = []
     if isinstance(unit_rule, UnitSkipRule):
         skipped_rules += unit_rule.skipped_rules
@@ -258,7 +257,7 @@ class UnitSkipRule(Rule):
         weight=unit_rule.weight + target_rule.weight, alias=unit_rule.alias)


-def GetAnyNtUnitRule(g):
+def get_any_nt_unit_rule(g):
     """Returns a non-terminal unit rule from 'g', or None if there is none."""
     for rule in g.rules:
         if len(rule.rhs) == 1 and isinstance(rule.rhs[0], NT):
@@ -266,28 +265,25 @@ def GetAnyNtUnitRule(g):
     return None


-def RemoveUnitRule(g, rule):
+def _remove_unit_rule(g, rule):
     """Removes 'rule' from 'g' without changing the language produced by 'g'."""
     new_rules = [x for x in g.rules if x != rule]
     refs = [x for x in g.rules if x.lhs == rule.rhs[0]]
-    for ref in refs:
-        new_rules.append(BuildUnitSkipRule(rule, ref))
+    new_rules += [build_unit_skiprule(rule, ref) for ref in refs]
     return Grammar(new_rules)


-def Split(rule):
+def _split(rule):
     """Splits a rule whose len(rhs) > 2 into shorter rules."""
     rule_str = str(rule.lhs) + '__' + '_'.join(str(x) for x in rule.rhs)
     rule_name = '__SP_%s' % (rule_str) + '_%d'
-    new_rules = [Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)]
+    yield Rule(rule.lhs, [rule.rhs[0], NT(rule_name % 1)], weight=rule.weight, alias=rule.alias)
     for i in xrange(1, len(rule.rhs) - 2):
-        new_rules.append( Rule(NT(rule_name % i),
-             [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split'))
-    new_rules.append(Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split'))
-    return new_rules
+        yield Rule(NT(rule_name % i), [rule.rhs[i], NT(rule_name % (i + 1))], weight=0, alias='Split')
+    yield Rule(NT(rule_name % (len(rule.rhs) - 2)), rule.rhs[-2:], weight=0, alias='Split')


-def Term(g):
+def _term(g):
     """Applies the TERM rule on 'g' (see top comment)."""
     all_t = {x for rule in g.rules for x in rule.rhs if isinstance(x, T)}
     t_rules = {t: Rule(NT('__T_%s' % str(t)), [t], weight=0, alias='Term') for t in all_t}
@@ -302,46 +298,46 @@ def Term(g):
     return Grammar(new_rules)


-def Bin(g):
+def _bin(g):
     """Applies the BIN rule to 'g' (see top comment)."""
     new_rules = []
     for rule in g.rules:
         if len(rule.rhs) > 2:
-            new_rules.extend(Split(rule))
+            new_rules += _split(rule)
         else:
             new_rules.append(rule)
     return Grammar(new_rules)


-def Unit(g):
+def _unit(g):
     """Applies the UNIT rule to 'g' (see top comment)."""
-    nt_unit_rule = GetAnyNtUnitRule(g)
+    
nt_unit_rule = get_any_nt_unit_rule(g) while nt_unit_rule: - g = RemoveUnitRule(g, nt_unit_rule) - nt_unit_rule = GetAnyNtUnitRule(g) + g = _remove_unit_rule(g, nt_unit_rule) + nt_unit_rule = get_any_nt_unit_rule(g) return g -def ToCnf(g): +def to_cnf(g): """Creates a CNF grammar from a general context-free grammar 'g'.""" - g = Unit(Bin(Term(g))) + g = _unit(_bin(_term(g))) return CnfWrapper(g) -def UnrollUnitSkipRule(lhs, orig_rhs, skipped_rules, children, weight, alias): +def unroll_unit_skiprule(lhs, orig_rhs, skipped_rules, children, weight, alias): if not skipped_rules: return RuleNode(Rule(lhs, orig_rhs, weight=weight, alias=alias), children, weight=weight) else: weight = weight - skipped_rules[0].weight return RuleNode( Rule(lhs, [skipped_rules[0].lhs], weight=weight, alias=alias), [ - UnrollUnitSkipRule(skipped_rules[0].lhs, orig_rhs, + unroll_unit_skiprule(skipped_rules[0].lhs, orig_rhs, skipped_rules[1:], children, skipped_rules[0].weight, skipped_rules[0].alias) ], weight=weight) -def RevertCnf(node): +def revert_cnf(node): """Reverts a parse tree (RuleNode) to its original non-CNF form (Node).""" if isinstance(node, T): return node @@ -350,16 +346,15 @@ def RevertCnf(node): return node.children[0] else: children = [] - reverted_children = [RevertCnf(x) for x in node.children] - for child in reverted_children: + for child in map(revert_cnf, node.children): # Reverts BIN rule. if isinstance(child, RuleNode) and child.rule.lhs.s.startswith('__SP_'): - children.extend(child.children) + children += child.children else: children.append(child) # Reverts UNIT rule. if isinstance(node.rule, UnitSkipRule): - return UnrollUnitSkipRule(node.rule.lhs, node.rule.rhs, + return unroll_unit_skiprule(node.rule.lhs, node.rule.rhs, node.rule.skipped_rules, children, node.rule.weight, node.rule.alias) else: From 16bfb25ddc795dcd6006e4882a3c9b83ae471552 Mon Sep 17 00:00:00 2001 From: ehudt Date: Wed, 24 Jan 2018 12:15:32 +0200 Subject: [PATCH 4/4] Update LICENSE --- LICENSE | 1 - 1 file changed, 1 deletion(-) diff --git a/LICENSE b/LICENSE index 737149b..efcb966 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,4 @@ Copyright © 2017 Erez Shinan -Copyright (c) 2018 Google LLC Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in