diff --git a/lark/grammar.py b/lark/grammar.py
index 14893fb..f90cce4 100644
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -3,10 +3,13 @@ from .utils import Serialize
 ###{standalone
 
 class Symbol(Serialize):
+    __slots__ = ('name', '_hash')
+
     is_term = NotImplemented
 
     def __init__(self, name):
         self.name = name
+        self._hash = hash(self.name)
 
     def __eq__(self, other):
         assert isinstance(other, Symbol), other
@@ -16,7 +19,7 @@ class Symbol(Serialize):
         return not (self == other)
 
     def __hash__(self):
-        return hash(self.name)
+        return self._hash
 
     def __repr__(self):
         return '%s(%r)' % (type(self).__name__, self.name)
@@ -31,6 +34,7 @@ class Terminal(Symbol):
 
     def __init__(self, name, filter_out=False):
         self.name = name
+        self._hash = hash(self.name)
         self.filter_out = filter_out
 
     @property
@@ -69,7 +73,7 @@ class Rule(Serialize):
         expansion : a list of symbols
         order : index of this expansion amongst all rules of the same name
     """
-    __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash')
+    __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash', '_rp')
 
     __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options'
     __serialize_namespace__ = Terminal, NonTerminal, RuleOptions
@@ -81,6 +85,7 @@ class Rule(Serialize):
         self.order = order
         self.options = options
         self._hash = hash((self.origin, tuple(self.expansion)))
+        self._rp = None
 
     def _deserialize(self):
         self._hash = hash((self.origin, tuple(self.expansion)))
@@ -101,4 +106,4 @@ class Rule(Serialize):
 
 
 
-###}
\ No newline at end of file
+###}
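# Illustrative sketch (not part of the patch): the change to Symbol/Terminal above
# precomputes the hash once in __init__ and returns the cached value from __hash__,
# which pays off when the same symbols are hashed repeatedly during set/dict
# operations. The class name below is hypothetical.

class CachedHashName(object):
    __slots__ = ('name', '_hash')

    def __init__(self, name):
        self.name = name
        self._hash = hash(name)    # hash the string once, up front

    def __eq__(self, other):
        return self.name == other.name

    def __hash__(self):
        return self._hash          # no re-hashing of the string on every lookup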
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index 4085ea5..71a7bc5 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -1,18 +1,41 @@
-from collections import Counter
+from collections import Counter, defaultdict
 
 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
 from ..grammar import Rule, Terminal, NonTerminal
 
+import time
+
+t_firsts = 0
+t_xy = 0
+t_call = 0
+cache_hits = 0
+cache_misses = 0
+
+# This used to be just a tuple (rp, la), but making it an object keeps the
+# default identity-based hash and equality, which are trivial to compute
+# (slightly faster for the set/hashtable lookups below?)
+class RulePtrLookahead(object):
+    __slots__ = 'rp', 'la'
+
+    def __init__(self, rp, la):
+        self.rp = rp
+        self.la = la
+
 
 class RulePtr(object):
-    __slots__ = ('rule', 'index')
+    __slots__ = ('rule', 'index', '_advance', '_lookaheads', '_next_rules_by_origin', '_first')
 
     def __init__(self, rule, index):
         assert isinstance(rule, Rule)
         assert index <= len(rule.expansion)
         self.rule = rule
         self.index = index
+        #self._hash = hash((self.rule, self.index))
+        #self._hash = None
+        self._advance = None
+        self._lookaheads = {}
+        self._next_rules_by_origin = None
+        self._first = None
 
     def __repr__(self):
         before = [x.name for x in self.rule.expansion[:self.index]]
@@ -23,32 +46,102 @@ class RulePtr(object):
     def next(self):
         return self.rule.expansion[self.index]
 
+    # don't create duplicate RulePtrs
     def advance(self, sym):
         assert self.next == sym
-        return RulePtr(self.rule, self.index+1)
+        a = self._advance
+        if a is None:
+            a = RulePtr(self.rule, self.index + 1)
+            self._advance = a
+        return a
 
     @property
     def is_satisfied(self):
         return self.index == len(self.rule.expansion)
 
+    def lookahead(self, la):
+        rp_la = self._lookaheads.get(la, None)
+        if rp_la is None:
+            rp_la = RulePtrLookahead(self, la)
+            self._lookaheads[la] = rp_la
+        return rp_la
+
+    def next_rules_by_origin(self, rules_by_origin):
+        n = self._next_rules_by_origin
+        if n is None:
+            n = rules_by_origin[self.next]
+            self._next_rules_by_origin = n
+        return n
+
+    # recursive form of lalr_analysis.py:343 (which is easier to understand IMO)
+    # normally we would avoid recursion, but it lets us cache
+    # each intermediate step in the corresponding RulePtr
+    def first(self, i, firsts, nullable, t):
+        global cache_hits
+        global cache_misses
+        global t_firsts
+        global t_xy
+        global t_call
+        t_call += time.time() - t
+        n = len(self.rule.expansion)
+        if i == n:
+            return ([], True)
+        x = self._first
+        t_x = time.time()
+        if x is None:
+            t0 = time.time()
+            t_y = time.time()
+            cache_misses += 1
+            s = self.rule.expansion[i]
+            l = list(firsts.get(s, []))
+            b = (s in nullable)
+            if b:
+                t1 = time.time()
+                t_firsts += t1 - t0
+                l_b_2 = self.advance(s).first(i + 1, firsts, nullable, time.time())
+                #l_b_2 = first(self.advance(self.next), i + 1, firsts, nullable, time.time())
+                t0 = time.time()
+                l.extend(l_b_2[0])
+                b = l_b_2[1]
+            x = (l, b)
+            self._first = x
+            t1 = time.time()
+            t_firsts += t1 - t0
+        else:
+            t_y = time.time()
+            cache_hits += 1
+        t_xy += t_y - t_x
+        return x
+
+    # the optimizations above ensure there are never two distinct but equal
+    # RulePtrs, so the default identity-based equality/hash is safe
+    # (should help set/hashtable lookups?)
+    '''
     def __eq__(self, other):
         return self.rule == other.rule and self.index == other.index
 
     def __hash__(self):
-        return hash((self.rule, self.index))
+        return self._hash
+    '''
+
 
 class LR0ItemSet(object):
-    __slots__ = ('kernel', 'closure', 'transitions')
+    __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads', '_hash')
 
     def __init__(self, kernel, closure):
         self.kernel = fzset(kernel)
         self.closure = fzset(closure)
         self.transitions = {}
+        self.lookaheads = defaultdict(set)
+        #self._hash = hash(self.kernel)
 
+    # state generation ensures no duplicate LR0ItemSets
+    '''
     def __eq__(self, other):
         return self.kernel == other.kernel
 
     def __hash__(self):
-        return hash(self.kernel)
+        return self._hash
+    '''
 
     def __repr__(self):
         return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
 
@@ -153,14 +246,22 @@ class GrammarAnalyzer(object):
                           for start in parser_conf.start}
 
         lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
+        assert(len(lr0_rules) == len(set(lr0_rules)))
 
        self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
 
-        self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
+        # cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
+        for root_rule in lr0_root_rules.values():
+            root_rule._rp = RulePtr(root_rule, 0)
+        self.lr0_start_states = {start: LR0ItemSet([root_rule._rp], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
                                  for start, root_rule in lr0_root_rules.items()}
 
         self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
 
+        # unused, did not help
+        self.lr1_cache = {}
+        self.lr1_cache2 = {}
+
     def expand_rule(self, source_rule, rules_by_origin=None):
         "Returns all init_ptrs accessible by rule (recursive)"
 
@@ -172,7 +273,11 @@ class GrammarAnalyzer(object):
             assert not rule.is_term, rule
 
             for r in rules_by_origin[rule]:
-                init_ptr = RulePtr(r, 0)
+                # don't create duplicate RulePtr objects
+                init_ptr = r._rp
+                if init_ptr is None:
+                    init_ptr = RulePtr(r, 0)
+                    r._rp = init_ptr
                 init_ptrs.add(init_ptr)
 
                 if r.expansion: # if not empty rule
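# Illustrative sketch (not part of the patch): the idea behind RulePtr.first()
# above, with the timing instrumentation stripped out. FIRST of the suffix
# expansion[i:] is FIRST(expansion[i]), plus FIRST of expansion[i+1:] whenever
# expansion[i] is nullable; memoizing per (rule, i) avoids recomputing it for
# every lookahead. The names first_of_suffix and memo are hypothetical.

def first_of_suffix(rule, i, FIRST, NULLABLE, memo):
    """Return (first_set, all_nullable) for rule.expansion[i:]."""
    if i == len(rule.expansion):
        return set(), True
    key = (rule, i)
    if key in memo:
        return memo[key]
    sym = rule.expansion[i]
    result = set(FIRST.get(sym, ()))
    nullable = sym in NULLABLE
    if nullable:
        # the suffix is only nullable if the rest of it is nullable too
        rest, nullable = first_of_suffix(rule, i + 1, FIRST, NULLABLE, memo)
        result |= rest
    memo[key] = (result, nullable)
    return result, nullable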
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index 61fe692..eb87e7a 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -7,13 +7,16 @@ For now, shift/reduce conflicts are automatically resolved as shifts.
 # Email : erezshin@gmail.com
 
 import logging
-from collections import defaultdict
+from collections import defaultdict, deque
 
 from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError
 
 from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet
 from ..grammar import Rule
+from . import grammar_analysis
+
+import time
 
 ###{standalone
 
@@ -28,6 +31,16 @@ class Action:
 Shift = Action('Shift')
 Reduce = Action('Reduce')
 
+t_set_0 = 0
+t_set_1 = 0
+t_expand = 0
+t_rules = 0
+t_append = 0
+t_z = 0
+t_begin = 0
+t_count = 0
+t_call = 0
+
 class ParseTable:
     def __init__(self, states, start_states, end_states):
         self.states = states
@@ -86,20 +99,24 @@ class LALR_Analyzer(GrammarAnalyzer):
 
     def generate_lr0_states(self):
         self.states = set()
+        # map of kernels to LR0ItemSets
+        cache = {}
 
         def step(state):
             _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)
 
             d = classify(unsat, lambda rp: rp.next)
             for sym, rps in d.items():
-                kernel = {rp.advance(sym) for rp in rps}
-                closure = set(kernel)
+                kernel = fzset({rp.advance(sym) for rp in rps})
+                new_state = cache.get(kernel, None)
+                if new_state is None:
+                    closure = set(kernel)
+                    for rp in kernel:
+                        if not rp.is_satisfied and not rp.next.is_term:
+                            closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
+                    new_state = LR0ItemSet(kernel, closure)
+                    cache[kernel] = new_state
 
-                for rp in kernel:
-                    if not rp.is_satisfied and not rp.next.is_term:
-                        closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
-
-                new_state = LR0ItemSet(kernel, closure)
                 state.transitions[sym] = new_state
                 yield new_state
 
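# Illustrative sketch (not part of the patch): the kernel-keyed cache used by
# generate_lr0_states above. States are deduplicated by their frozen kernel, so
# a kernel seen before reuses the existing item set instead of rebuilding (and
# re-hashing) its closure. The names below are hypothetical.

def get_or_make_state(kernel, cache, build_closure, make_state):
    """kernel: frozenset of items; cache: dict mapping kernel -> state."""
    state = cache.get(kernel)
    if state is None:
        # only new kernels pay for closure construction
        state = make_state(kernel, build_closure(kernel))   # stands in for LR0ItemSet(kernel, closure)
        cache[kernel] = state
    return state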
@@ -109,36 +126,59 @@
             pass
 
     def discover_lookaheads(self):
+        # lookaheads is now a member of LR0ItemSet, so we don't need a dictionary lookup here
         # state -> rule -> set of lookaheads
-        self.lookaheads = defaultdict(lambda: defaultdict(set))
+        #self.lookaheads = defaultdict(lambda: defaultdict(set))
         # state -> rule -> list of (set of lookaheads) to propagate to
-        self.propagates = defaultdict(lambda: defaultdict(list))
+        #self.propagates = defaultdict(lambda: defaultdict(list))
+        self.propagates = {}
+
+        t0 = time.time()
 
+        t = Terminal('$END')
         for s in self.lr0_start_states.values():
             for rp in s.kernel:
-                self.lookaheads[s][rp].add(Terminal('$END'))
+                #self.lookaheads[s][rp].add(Terminal('$END'))
+                s.lookaheads[rp].add(t)
+
+        t_closure = 0
 
         # There is a 1 to 1 correspondance between LR0 and LALR1 states.
        # We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items.
         # use a terminal that does not exist in the grammar
         t = Terminal('$#')
         for s in self.states:
+            p = {}
+            self.propagates[s] = p
             for rp in s.kernel:
-                for rp2, la in self.generate_lr1_closure([(rp, t)]):
+                q = []
+                p[rp] = q
+                t2 = time.time()
+                z = self.generate_lr1_closure([rp.lookahead(t)], time.time())
+                t3 = time.time()
+                t_closure += t3 - t2
+                #for rp2, la in self.generate_lr1_closure([(rp, t)], time.time()):
+                for rp2_la in z:
+                    rp2 = rp2_la.rp
+                    la = rp2_la.la
                     if rp2.is_satisfied:
                         continue
                     next_symbol = rp2.next
                     next_state = s.transitions[next_symbol]
                     rp3 = rp2.advance(next_symbol)
                     assert(rp3 in next_state.kernel)
-                    x = self.lookaheads[next_state][rp3]
+                    #x = self.lookaheads[next_state][rp3]
+                    x = next_state.lookaheads[rp3]
                     if la == t:
                         # we must propagate rp's lookaheads to rp3's lookahead set
-                        self.propagates[s][rp].append(x)
+                        q.append(x)
                     else:
                         # this lookahead is "generated spontaneously" for rp3
                         x.add(la)
 
+        t1 = time.time()
+        print('Discovering took {:.3f} (generating closure), {:.3f} (total)'.format(t_closure, t1 - t0))
+
     def propagate_lookaheads(self):
         changed = True
         while changed:
@@ -146,7 +186,8 @@
             for s in self.states:
                 for rp in s.kernel:
                     # from (from is a keyword)
-                    f = self.lookaheads[s][rp]
+                    #f = self.lookaheads[s][rp]
+                    f = s.lookaheads[rp]
                     # to
                     t = self.propagates[s][rp]
                     for x in t:
@@ -155,20 +196,33 @@
                         changed = changed or (len(x) != old)
 
     def generate_lalr1_states(self):
+        t0 = time.time()
         # 1 to 1 correspondance between LR0 and LALR1 states
         # We must fetch the lookaheads we calculated,
         # to create the LALR1 kernels from the LR0 kernels.
         # Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items.
         # map of LR0 states to LALR1 states
         m = {}
+        t_closure = 0
+        z = 0
         for s in self.states:
+            z = max(z, len(s.closure))
             kernel = []
             for rp in s.kernel:
-                las = self.lookaheads[s][rp]
+                #las = self.lookaheads[s][rp]
+                las = s.lookaheads[rp]
                 assert(len(las) > 0)
                 for la in las:
-                    kernel.append((rp, la))
-            m[s] = self.generate_lr1_closure(kernel)
+                    kernel.append(rp.lookahead(la))
+            t0_0 = time.time()
+            m[s] = self.generate_lr1_closure(kernel, time.time())
+            t0_1 = time.time()
+            t_closure += t0_1 - t0_0
+
+        print('Generating lalr1 closure for lalr kernels took {:.3f}'.format(t_closure))
+        print('Max lr0 state size was {}'.format(z))
+
+        t1 = time.time()
 
         self.states = {}
         for s, v in m.items():
@@ -176,8 +230,8 @@
             for la, next_state in s.transitions.items():
                 actions[la] = (Shift, next_state.closure)
 
-            sat, _ = classify_bool(v, lambda x: x[0].is_satisfied)
-            reductions = classify(sat, lambda x: x[1], lambda x: x[0])
+            sat, _ = classify_bool(v, lambda x: x.rp.is_satisfied)
+            reductions = classify(sat, lambda x: x.la, lambda x: x.rp)
             for la, rps in reductions.items():
                 if len(rps) > 1:
                     raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ])))
@@ -190,6 +244,8 @@
 
             self.states[s.closure] = {k.name: v for k, v in actions.items()}
 
+        t2 = time.time()
+
         end_states = {}
         for s in self.states:
             for rp in s:
@@ -198,44 +254,168 @@
                         assert(not start in end_states)
                         end_states[start] = s
 
+        t3 = time.time()
+
         self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states)
 
+        t4 = time.time()
+
         if self.debug:
             self.parse_table = self._parse_table
         else:
             self.parse_table = IntParseTable.from_ParseTable(self._parse_table)
 
-    def generate_lr1_closure(self, kernel):
+        t5 = time.time()
+
+        print(('Generating lalr1 states took ' + ', '.join([ '{:.3f}' ] * 5)).format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4))
+        print('Generating firsts took {:.3f} (time actually calculating), {:.3f} (end to end), {:.3f} (just function call)'.format(grammar_analysis.t_firsts, grammar_analysis.t_xy, grammar_analysis.t_call))
+
+    def generate_lr1_closure(self, kernel, t_caller):
+        global t_call
+        global t_set_0
+        global t_set_1
+        global t_expand
+        global t_rules
+        global t_append
+        global t_z
+        global t_begin
+        global t_count
+
+        t_start = time.time()
+        t_call += t_start - t_caller
+
+        # cache the results of this function
+        # not many hits, no noticeable performance improvement
+        '''
+        k = fzset(kernel)
+        cached = self.lr1_cache.get(k, None)
+        if not cached is None:
+            return cached
+        '''
+
         closure = set()
+        closure_hash = {}
+
+        y = 0
 
         q = list(kernel)
         while len(q) > 0:
-            rp, la = q.pop()
-            if (rp, la) in closure:
+            t_a = time.time()
+            rp_la = q.pop()
+            #rp_la_hash = hash(rp_la)
+            t0 = time.time()
+            t_begin += t0 - t_a
+            # tried manually maintaining the hashtable,
+            # since a set of just hashes (ints) was notably faster
+            '''
+            if rp_la_hash in closure_hash:
+                if rp_la in closure_hash[rp_la_hash]:
+                    t0_0 = time.time()
+                    t_set_0 += t0_0 - t0
+                    continue
+                t0_0 = time.time()
+                t_set_0 += t0_0 - t0
+            else:
+                closure_hash[rp_la_hash] = []
+            '''
+            if rp_la in closure:
+                t0_0 = time.time()
+                t_set_0 += t0_0 - t0
                 continue
-            closure.add((rp, la))
+            t0_0 = time.time()
+            closure.add(rp_la)
+            #closure_hash[rp_la_hash].append(rp_la)
+            t1 = time.time()
+            t_set_0 += t0_0 - t0
+            t_set_1 += t1 - t0_0
 
+            rp = rp_la.rp
+            la = rp_la.la
             if rp.is_satisfied:
                 continue
             if rp.next.is_term:
                 continue
 
+            t2 = time.time()
+
+            # cache these calculations inside each RulePtr
+            # see grammar_analysis.py:79
             l = []
+            '''
             i = rp.index + 1
             n = len(rp.rule.expansion)
-            while i < n:
-                s = rp.rule.expansion[i]
-                l.extend(self.FIRST.get(s, []))
-                if not s in self.NULLABLE:
-                    break
-                i += 1
-
+            l2_i = self.lr1_cache2.get((rp.rule, i), None)
+            l2 = []
+            if l2_i is None:
+                while i < n:
+                    s = rp.rule.expansion[i]
+                    l2.extend(self.FIRST.get(s, []))
+                    if not s in self.NULLABLE:
+                        break
+                    i += 1
+                self.lr1_cache2[(rp.rule, i)] = (l2, i)
+            else:
+                l2 = l2_i[0]
+                i = l2_i[1]
+
+            l.extend(l2)
+            '''
+            # this function call seems really slow (see grammar_analysis.t_call above)
+            # tried making it a plain function instead of a method (to skip the attribute lookup);
+            # still equally slow
+            l2, nullable = rp.first(rp.index + 1, self.FIRST, self.NULLABLE, time.time())
+            #l2, nullable = grammar_analysis.first(rp, rp.index + 1, self.FIRST, self.NULLABLE, time.time())
+            #l.extend(l2)
+            l = l2
+            t3 = time.time()
+
+            t_expand += t3 - t2
+
+            # if we don't modify l2 and add an extra check in the loop below,
+            # we don't have to copy it
             # if all of rp.rule.expansion[rp.index + 1:] were nullable:
-            if i == n:
-                l.append(la)
+            #if nullable:
+            #    l.append(la)
+
+            t4 = time.time()
+            x = rp.next_rules_by_origin(self.lr0_rules_by_origin)
+            t5 = time.time()
 
-            for r in self.lr0_rules_by_origin[rp.next]:
+            # usually between 20-60; seen as high as ~175
+            y = max(y, len(x) * len(l))
+            #print('adding {} * {} rules to closure max {}'.format(len(x), len(l), y))
+            for r in x:
                 for s in l:
-                    q.append((RulePtr(r, 0), s))
+                    # cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
+                    # cache r._rp in a local _rp (one less attribute lookup?)
+                    _rp = r._rp
+                    if _rp is None:
+                        _rp = RulePtr(r, 0)
+                        r._rp = _rp
+                    q.append(_rp.lookahead(s))
+                    #q.append((r._rp, s))
+                if nullable:
+                    _rp = r._rp
+                    if _rp is None:
+                        _rp = RulePtr(r, 0)
+                        r._rp = _rp
+                    q.append(_rp.lookahead(la))
+                    #q.append((r._rp, la))
+
+            t6 = time.time()
+            t_rules += t5 - t4
+            t_append += t6 - t5
+
+        #self.lr1_cache[k] = closure
+
+        t_end = time.time()
+        t_z += t_end - t_start
+
+        t_count += 1
+
+        if t_count % 1000 == 0:
+            print('\tGenerating lr1 closure took begin {:.3f}, set contains {:.3f}, set add {:.3f}, get first {:.3f}'.format(t_begin, t_set_0, t_set_1, t_expand))
+            print('\tget next rules {:.3f}, append rules {:.3f}, total {:.3f}, call time {:.3f}, count {}'.format(t_rules, t_append, t_z, t_call, t_count))
+            print('\tmax number of appends {}'.format(y))
 
         return closure
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 6eb3839..b3985ae 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -8,6 +8,8 @@ from ..utils import Enumerator, Serialize
 
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 
+import time
+
 ###{standalone
 
 class LALR_Parser(object):
@@ -15,10 +17,20 @@ class LALR_Parser(object):
         assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization"
 
         analysis = LALR_Analyzer(parser_conf, debug=debug)
+        t0 = time.time()
         analysis.generate_lr0_states()
+        t1 = time.time()
         analysis.discover_lookaheads()
+        t2 = time.time()
         analysis.propagate_lookaheads()
+        t3 = time.time()
         analysis.generate_lalr1_states()
+        t4 = time.time()
+        print('Generating lr0 states took {:.3f}'.format(t1 - t0))
+        print('Discovering lookaheads took {:.3f}'.format(t2 - t1))
+        print('Propagating lookaheads took {:.3f}'.format(t3 - t2))
+        print('Generating lalr states (closure) took {:.3f}'.format(t4 - t3))
+        print('-' * 32)
         callbacks = parser_conf.callbacks
 
         self._parse_table = analysis.parse_table
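# Illustrative sketch (not part of the patch): the fixpoint that
# propagate_lookaheads() runs above. discover_lookaheads() records, per kernel
# item, which lookahead sets its own set feeds ("propagates") and which symbols
# are generated spontaneously; this loop then copies lookaheads along those
# links until no set grows. The names lookaheads/propagates are stand-ins for
# the per-state structures used in the patch.

def propagate_until_fixpoint(lookaheads, propagates):
    """lookaheads: item -> set of symbols; propagates: item -> list of target sets."""
    changed = True
    while changed:
        changed = False
        for item, source in lookaheads.items():
            for target in propagates.get(item, ()):
                before = len(target)
                target |= source                     # copy lookaheads to dependents
                changed = changed or len(target) != before
    return lookaheads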