From b01c283d47d7194959d057bbaf18a25640acc92a Mon Sep 17 00:00:00 2001
From: julienmalard <julien.malard@mail.mcgill.ca>
Date: Tue, 10 Nov 2020 10:33:53 -0500
Subject: [PATCH 1/8] Failing test

---
 tests/test_reconstructor.py | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py
index 93c64fe..6196f4a 100644
--- a/tests/test_reconstructor.py
+++ b/tests/test_reconstructor.py
@@ -140,6 +140,43 @@ class TestReconstructor(TestCase):
         new_json = Reconstructor(json_parser).reconstruct(tree)
         self.assertEqual(json.loads(new_json), json.loads(test_json))
 
+    def test_switch_grammar_unicode_terminal(self):
+        """
+        This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
+        with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
+        keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
+        tokens (e.g., `+=`) to mis-match between the two grammars.
+        """
+
+        g1 = """
+        start: (NL | stmt)*
+        stmt: "keyword" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\s*/
+        """ + common
+
+        g2 = """
+        start: (NL | stmt)*
+        stmt: "குறிப்பு" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\s*/
+        """ + common
+
+        code = """
+        keyword x += y
+        """
+
+        l1 = Lark(g1, parser='lalr')
+        l2 = Lark(g2, parser='lalr')
+        r = Reconstructor(l2)
+
+        tree = l1.parse(code)
+        code2 = r.reconstruct(tree)
+        assert l2.parse(code2) == tree
+
+
 
 if __name__ == '__main__':
     unittest.main()

From 74c94bb3695453e13ba540587e6d308b5f9de827 Mon Sep 17 00:00:00 2001
From: julienmalard <julien.malard@mail.mcgill.ca>
Date: Tue, 10 Nov 2020 10:34:06 -0500
Subject: [PATCH 2/8] Tests now pass!

---
 lark/load_grammar.py       |  6 +++---
 lark/reconstruct.py        |  7 ++-----
 lark/utils.py              | 12 ++++++++++++
 tests/test_nearley/nearley |  2 +-
 4 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index eb0273c..dcf90dd 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -6,7 +6,7 @@ from copy import copy, deepcopy
 from io import open
 import pkgutil
 
-from .utils import bfs, eval_escaping, Py36, logger, classify_bool
+from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isalnum, isalpha
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 
 from .parse_tree_builder import ParseTreeBuilder
@@ -328,9 +328,9 @@ class PrepareAnonTerminals(Transformer_InPlace):
                 try:
                     term_name = _TERMINAL_NAMES[value]
                 except KeyError:
-                    if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
+                    if isalnum(value) and isalpha(value[0]) and value.upper() not in self.term_set:
                         with suppress(UnicodeEncodeError):
-                            value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
+                            value.upper().encode('utf8')  # Why shouldn't we have unicode in our terminal names?
                             term_name = value.upper()
 
                 if term_name in self.term_set:
diff --git a/lark/reconstruct.py b/lark/reconstruct.py
index e7cff31..614fb5e 100644
--- a/lark/reconstruct.py
+++ b/lark/reconstruct.py
@@ -8,6 +8,7 @@ from .lexer import Token, PatternStr
 from .grammar import Terminal, NonTerminal
 
 from .tree_matcher import TreeMatcher, is_discarded_terminal
+from .utils import isalnum
 
 def is_iter_empty(i):
     try:
@@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace):
         return to_write
 
 
-def _isalnum(x):
-    # Categories defined here: https://www.python.org/dev/peps/pep-3131/
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
-
 class Reconstructor(TreeMatcher):
     """
     A Reconstructor that will, given a full parse Tree, generate source code.
@@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher):
         y = []
         prev_item = ''
         for item in x:
-            if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
+            if prev_item and item and isalnum(prev_item[-1]) and isalnum(item[0]):
                 y.append(' ')
             y.append(item)
             prev_item = item
diff --git a/lark/utils.py b/lark/utils.py
index cfd4306..b0f0e22 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -1,4 +1,5 @@
 import sys
+import unicodedata
 import os
 from functools import reduce
 from ast import literal_eval
@@ -12,6 +13,17 @@ logger.addHandler(logging.StreamHandler())
 # By default, we should not output any log messages
 logger.setLevel(logging.CRITICAL)
 
+def isalnum(x):
+    if len(x) != 1:
+        return all(isalnum(y) for y in x)
+    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
+
+
+def isalpha(x):
+    if len(x) != 1:
+        return all(isalpha(y) for y in x)
+    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc']
+
 
 def classify(seq, key=None, value=None):
     d = {}
diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley
index a46b374..cf8925f 160000
--- a/tests/test_nearley/nearley
+++ b/tests/test_nearley/nearley
@@ -1 +1 @@
-Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44
+Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de

From e5081420c68251cb740c7b5a1dd3118b5bf89cca Mon Sep 17 00:00:00 2001
From: julienmalard <julien.malard@mail.mcgill.ca>
Date: Tue, 10 Nov 2020 10:47:02 -0500
Subject: [PATCH 3/8] Add encoding specification for Python 2

---
 tests/test_reconstructor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py
index 6196f4a..45a6656 100644
--- a/tests/test_reconstructor.py
+++ b/tests/test_reconstructor.py
@@ -1,3 +1,5 @@
+# coding=utf-8
+
 import json
 import unittest
 from unittest import TestCase

From 02c39a84dba0f05aeeef8b0d84a6e98d507a8387 Mon Sep 17 00:00:00 2001
From: julienmalard <julien.malard@mail.mcgill.ca>
Date: Tue, 10 Nov 2020 11:52:37 -0500
Subject: [PATCH 4/8] Skipping this test for Python 2 for now.

---
 tests/test_reconstructor.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py
index 45a6656..f132312 100644
--- a/tests/test_reconstructor.py
+++ b/tests/test_reconstructor.py
@@ -1,19 +1,22 @@
 # coding=utf-8
 
 import json
+import sys
 import unittest
 from unittest import TestCase
+
 from lark import Lark
 from lark.reconstruct import Reconstructor
 
-
 common = """
 %import common (WS_INLINE, NUMBER, WORD)
 %ignore WS_INLINE
 """
 
+
 def _remove_ws(s):
-    return s.replace(' ', '').replace('\n','')
+    return s.replace(' ', '').replace('\n', '')
+
 
 class TestReconstructor(TestCase):
 
@@ -24,7 +27,6 @@ class TestReconstructor(TestCase):
         self.assertEqual(_remove_ws(code), _remove_ws(new))
 
     def test_starred_rule(self):
-
         g = """
         start: item*
         item: NL
@@ -40,7 +42,6 @@ class TestReconstructor(TestCase):
         self.assert_reconstruct(g, code)
 
     def test_starred_group(self):
-
         g = """
         start: (rule | NL)*
         rule: WORD ":" NUMBER
@@ -54,7 +55,6 @@ class TestReconstructor(TestCase):
         self.assert_reconstruct(g, code)
 
     def test_alias(self):
-
         g = """
         start: line*
         line: NL
@@ -142,6 +142,7 @@ class TestReconstructor(TestCase):
         new_json = Reconstructor(json_parser).reconstruct(tree)
         self.assertEqual(json.loads(new_json), json.loads(test_json))
 
+    @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
     def test_switch_grammar_unicode_terminal(self):
         """
         This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
@@ -179,6 +180,5 @@ class TestReconstructor(TestCase):
         assert l2.parse(code2) == tree
 
 
-
 if __name__ == '__main__':
     unittest.main()

From 364f9ae3a567883cc0fd1b114fa443dd690239f6 Mon Sep 17 00:00:00 2001
From: julienmalard <julien.malard@mail.mcgill.ca>
Date: Tue, 10 Nov 2020 14:28:01 -0500
Subject: [PATCH 5/8] Response to code review

---
 lark/load_grammar.py | 8 +++-----
 lark/reconstruct.py  | 4 ++--
 lark/utils.py        | 8 ++++++--
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index dcf90dd..e45053e 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -6,7 +6,7 @@ from copy import copy, deepcopy
 from io import open
 import pkgutil
 
-from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isalnum, isalpha
+from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, isalpha
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 
 from .parse_tree_builder import ParseTreeBuilder
@@ -328,10 +328,8 @@ class PrepareAnonTerminals(Transformer_InPlace):
                 try:
                     term_name = _TERMINAL_NAMES[value]
                 except KeyError:
-                    if isalnum(value) and isalpha(value[0]) and value.upper() not in self.term_set:
-                        with suppress(UnicodeEncodeError):
-                            value.upper().encode('utf8')  # Why shouldn't we have unicode in our terminal names?
-                            term_name = value.upper()
+                    if is_id_continue(value) and isalpha(value[0]) and value.upper() not in self.term_set:
+                        term_name = value.upper()
 
                 if term_name in self.term_set:
                     term_name = None
diff --git a/lark/reconstruct.py b/lark/reconstruct.py
index 614fb5e..2efc0ae 100644
--- a/lark/reconstruct.py
+++ b/lark/reconstruct.py
@@ -8,7 +8,7 @@ from .lexer import Token, PatternStr
 from .grammar import Terminal, NonTerminal
 
 from .tree_matcher import TreeMatcher, is_discarded_terminal
-from .utils import isalnum
+from .utils import is_id_continue
 
 def is_iter_empty(i):
     try:
@@ -94,7 +94,7 @@ class Reconstructor(TreeMatcher):
         y = []
         prev_item = ''
         for item in x:
-            if prev_item and item and isalnum(prev_item[-1]) and isalnum(item[0]):
+            if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
                 y.append(' ')
             y.append(item)
             prev_item = item
diff --git a/lark/utils.py b/lark/utils.py
index b0f0e22..498a12a 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -13,9 +13,13 @@ logger.addHandler(logging.StreamHandler())
 # By default, we should not output any log messages
 logger.setLevel(logging.CRITICAL)
 
-def isalnum(x):
+def is_id_continue(x):
+    """
+    Checks if all characters in `x` are alphanumeric characters (Unicode standard, so diactrics, Indian vowels, non-latin
+    numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier.
+    """
     if len(x) != 1:
-        return all(isalnum(y) for y in x)
+        return all(is_id_continue(y) for y in x)
     return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
 
 

From 1b2fc2bda44e67ea3a57ea010d8ededbdb593af9 Mon Sep 17 00:00:00 2001
From: julienmalard <julien.malard@mail.mcgill.ca>
Date: Tue, 10 Nov 2020 16:05:32 -0500
Subject: [PATCH 6/8] Code review #2

---
 lark/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lark/utils.py b/lark/utils.py
index 498a12a..29fa514 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -16,7 +16,7 @@ logger.setLevel(logging.CRITICAL)
 def is_id_continue(x):
     """
     Checks if all characters in `x` are alphanumeric characters (Unicode standard, so diactrics, Indian vowels, non-latin
-    numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier.
+    numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details.
     """
     if len(x) != 1:
         return all(is_id_continue(y) for y in x)
@@ -24,6 +24,7 @@ def is_id_continue(x):
 
 
 def isalpha(x):
+    """See PEP 3131 for details."""
     if len(x) != 1:
         return all(isalpha(y) for y in x)
     return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc']

From f8b0ca3ccc1f27bcaa9e7dedf6ae7374d677ac4f Mon Sep 17 00:00:00 2001
From: julienmalard <julien.malard@mail.mcgill.ca>
Date: Tue, 10 Nov 2020 17:57:00 -0500
Subject: [PATCH 7/8] Code review 3

---
 lark/load_grammar.py |  4 ++--
 lark/utils.py        | 13 ++++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index e45053e..ee38dc8 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -6,7 +6,7 @@ from copy import copy, deepcopy
 from io import open
 import pkgutil
 
-from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, isalpha
+from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, is_id_start
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 
 from .parse_tree_builder import ParseTreeBuilder
@@ -328,7 +328,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
                 try:
                     term_name = _TERMINAL_NAMES[value]
                 except KeyError:
-                    if is_id_continue(value) and isalpha(value[0]) and value.upper() not in self.term_set:
+                    if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
                         term_name = value.upper()
 
                 if term_name in self.term_set:
diff --git a/lark/utils.py b/lark/utils.py
index 29fa514..89498c6 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -20,14 +20,17 @@ def is_id_continue(x):
     """
     if len(x) != 1:
         return all(is_id_continue(y) for y in x)
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
+    return x == '_' or unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
 
 
-def isalpha(x):
-    """See PEP 3131 for details."""
+def is_id_start(x):
+    """
+    Checks if all characters in `x` are alphabetic characters (Unicode standard, so diactrics, Indian vowels, non-latin
+    numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details.
+    """
     if len(x) != 1:
-        return all(isalpha(y) for y in x)
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc']
+        return all(is_id_start(y) for y in x)
+    return x == '_' or unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc']
 
 
 def classify(seq, key=None, value=None):

From 89538498411f7675965ba248500d771f8a492fdd Mon Sep 17 00:00:00 2001
From: Erez Sh <erezshin@gmail.com>
Date: Sun, 29 Nov 2020 21:45:59 +0200
Subject: [PATCH 8/8] Allow unicode in terminal names

---
 lark/load_grammar.py       | 4 +---
 tests/test_nearley/nearley | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index af131c1..d663215 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -333,9 +333,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
                     term_name = _TERMINAL_NAMES[value]
                 except KeyError:
                     if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
-                        with suppress(UnicodeEncodeError):
-                            value.upper().encode('ascii')  # Make sure we don't have unicode in our terminal names
-                            term_name = value.upper()
+                        term_name = value.upper()
 
                 if term_name in self.term_set:
                     term_name = None
diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley
index cf8925f..a46b374 160000
--- a/tests/test_nearley/nearley
+++ b/tests/test_nearley/nearley
@@ -1 +1 @@
-Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de
+Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44