From f8b0ca3ccc1f27bcaa9e7dedf6ae7374d677ac4f Mon Sep 17 00:00:00 2001
From: julienmalard <julien.malard@mail.mcgill.ca>
Date: Tue, 10 Nov 2020 17:57:00 -0500
Subject: [PATCH] Code review 3

---
 lark/load_grammar.py |  4 ++--
 lark/utils.py        | 13 ++++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index e45053e..ee38dc8 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -6,7 +6,7 @@ from copy import copy, deepcopy
 from io import open
 import pkgutil
 
-from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, isalpha
+from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, is_id_start
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 
 from .parse_tree_builder import ParseTreeBuilder
@@ -328,7 +328,7 @@ class PrepareAnonTerminals(Transformer_InPlace):
                 try:
                     term_name = _TERMINAL_NAMES[value]
                 except KeyError:
-                    if is_id_continue(value) and isalpha(value[0]) and value.upper() not in self.term_set:
+                    if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
                         term_name = value.upper()
 
                 if term_name in self.term_set:
diff --git a/lark/utils.py b/lark/utils.py
index 29fa514..89498c6 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -20,14 +20,17 @@ def is_id_continue(x):
     """
     if len(x) != 1:
         return all(is_id_continue(y) for y in x)
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
+    return x == '_' or unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
 
 
-def isalpha(x):
-    """See PEP 3131 for details."""
+def is_id_start(x):
+    """
+    Checks if all characters in `x` are alphabetic characters (Unicode standard, so diacritics, Indian vowels, non-latin
+    numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details.
+    """
     if len(x) != 1:
-        return all(isalpha(y) for y in x)
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc']
+        return all(is_id_start(y) for y in x)
+    return x == '_' or unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc']
 
 
 def classify(seq, key=None, value=None):