From 364f9ae3a567883cc0fd1b114fa443dd690239f6 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 10 Nov 2020 14:28:01 -0500 Subject: [PATCH] Response to code review --- lark/load_grammar.py | 8 +++----- lark/reconstruct.py | 4 ++-- lark/utils.py | 8 ++++++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dcf90dd..e45053e 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -6,7 +6,7 @@ from copy import copy, deepcopy from io import open import pkgutil -from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isalnum, isalpha +from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, isalpha from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -328,10 +328,8 @@ class PrepareAnonTerminals(Transformer_InPlace): try: term_name = _TERMINAL_NAMES[value] except KeyError: - if isalnum(value) and isalpha(value[0]) and value.upper() not in self.term_set: - with suppress(UnicodeEncodeError): - value.upper().encode('utf8') # Why shouldn't we have unicode in our terminal names? 
- term_name = value.upper() + if is_id_continue(value) and isalpha(value[0]) and value.upper() not in self.term_set: + term_name = value.upper() if term_name in self.term_set: term_name = None diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 614fb5e..2efc0ae 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -8,7 +8,7 @@ from .lexer import Token, PatternStr from .grammar import Terminal, NonTerminal from .tree_matcher import TreeMatcher, is_discarded_terminal -from .utils import isalnum +from .utils import is_id_continue def is_iter_empty(i): try: @@ -94,7 +94,7 @@ class Reconstructor(TreeMatcher): y = [] prev_item = '' for item in x: - if prev_item and item and isalnum(prev_item[-1]) and isalnum(item[0]): + if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]): y.append(' ') y.append(item) prev_item = item diff --git a/lark/utils.py b/lark/utils.py index b0f0e22..498a12a 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -13,9 +13,13 @@ logger.addHandler(logging.StreamHandler()) # By default, we should not output any log messages logger.setLevel(logging.CRITICAL) -def isalnum(x): +def is_id_continue(x): + """ + Checks if all characters in `x` are alphanumeric characters (Unicode standard, so diacritics, Indian vowels, non-latin + numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. + """ if len(x) != 1: - return all(isalnum(y) for y in x) + return all(is_id_continue(y) for y in x) return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']