From b01c283d47d7194959d057bbaf18a25640acc92a Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 10 Nov 2020 10:33:53 -0500 Subject: [PATCH 1/8] Failing test --- tests/test_reconstructor.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index 93c64fe..6196f4a 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -140,6 +140,43 @@ class TestReconstructor(TestCase): new_json = Reconstructor(json_parser).reconstruct(tree) self.assertEqual(json.loads(new_json), json.loads(test_json)) + def test_switch_grammar_unicode_terminal(self): + """ + This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed + with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode + keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON + tokens (e.g., `+=`) to mis-match between the two grammars. + """ + + g1 = """ + start: (NL | stmt)* + stmt: "keyword" var op var + !op: ("+=" | "-=" | "*=" | "/=") + var: WORD + NL: /(\\r?\\n)+\s*/ + """ + common + + g2 = """ + start: (NL | stmt)* + stmt: "குறிப்பு" var op var + !op: ("+=" | "-=" | "*=" | "/=") + var: WORD + NL: /(\\r?\\n)+\s*/ + """ + common + + code = """ + keyword x += y + """ + + l1 = Lark(g1, parser='lalr') + l2 = Lark(g2, parser='lalr') + r = Reconstructor(l2) + + tree = l1.parse(code) + code2 = r.reconstruct(tree) + assert l2.parse(code2) == tree + + if __name__ == '__main__': unittest.main() From 74c94bb3695453e13ba540587e6d308b5f9de827 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 10 Nov 2020 10:34:06 -0500 Subject: [PATCH 2/8] Tests now pass! --- lark/load_grammar.py | 6 +++--- lark/reconstruct.py | 7 ++----- lark/utils.py | 12 ++++++++++++ tests/test_nearley/nearley | 2 +- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index eb0273c..dcf90dd 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -6,7 +6,7 @@ from copy import copy, deepcopy from io import open import pkgutil -from .utils import bfs, eval_escaping, Py36, logger, classify_bool +from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isalnum, isalpha from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -328,9 +328,9 @@ class PrepareAnonTerminals(Transformer_InPlace): try: term_name = _TERMINAL_NAMES[value] except KeyError: - if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set: + if isalnum(value) and isalpha(value[0]) and value.upper() not in self.term_set: with suppress(UnicodeEncodeError): - value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names + value.upper().encode('utf8') # Why shouldn't we have unicode in our terminal names? term_name = value.upper() if term_name in self.term_set: diff --git a/lark/reconstruct.py b/lark/reconstruct.py index e7cff31..614fb5e 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -8,6 +8,7 @@ from .lexer import Token, PatternStr from .grammar import Terminal, NonTerminal from .tree_matcher import TreeMatcher, is_discarded_terminal +from .utils import isalnum def is_iter_empty(i): try: @@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace): return to_write -def _isalnum(x): - # Categories defined here: https://www.python.org/dev/peps/pep-3131/ - return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] - class Reconstructor(TreeMatcher): """ A Reconstructor that will, given a full parse Tree, generate source code. @@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher): y = [] prev_item = '' for item in x: - if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]): + if prev_item and item and isalnum(prev_item[-1]) and isalnum(item[0]): y.append(' ') y.append(item) prev_item = item diff --git a/lark/utils.py b/lark/utils.py index cfd4306..b0f0e22 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,4 +1,5 @@ import sys +import unicodedata import os from functools import reduce from ast import literal_eval @@ -12,6 +13,17 @@ logger.addHandler(logging.StreamHandler()) # By default, we should not output any log messages logger.setLevel(logging.CRITICAL) +def isalnum(x): + if len(x) != 1: + return all(isalnum(y) for y in x) + return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] + + +def isalpha(x): + if len(x) != 1: + return all(isalpha(y) for y in x) + return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'] + def classify(seq, key=None, value=None): d = {} diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley index a46b374..cf8925f 160000 --- a/tests/test_nearley/nearley +++ b/tests/test_nearley/nearley @@ -1 +1 @@ -Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44 +Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de From e5081420c68251cb740c7b5a1dd3118b5bf89cca Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 10 Nov 2020 10:47:02 -0500 Subject: [PATCH 3/8] Add encoding specification for Python 2 --- tests/test_reconstructor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index 6196f4a..45a6656 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -1,3 +1,5 @@ +# coding=utf-8 + import json import unittest from unittest import TestCase From 02c39a84dba0f05aeeef8b0d84a6e98d507a8387 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 10 Nov 2020 11:52:37 -0500 Subject: [PATCH 4/8] Skipping this test for Python 2 for now. --- tests/test_reconstructor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index 45a6656..f132312 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -1,19 +1,22 @@ # coding=utf-8 import json +import sys import unittest from unittest import TestCase + from lark import Lark from lark.reconstruct import Reconstructor - common = """ %import common (WS_INLINE, NUMBER, WORD) %ignore WS_INLINE """ + def _remove_ws(s): - return s.replace(' ', '').replace('\n','') + return s.replace(' ', '').replace('\n', '') + class TestReconstructor(TestCase): @@ -24,7 +27,6 @@ class TestReconstructor(TestCase): self.assertEqual(_remove_ws(code), _remove_ws(new)) def test_starred_rule(self): - g = """ start: item* item: NL @@ -40,7 +42,6 @@ class TestReconstructor(TestCase): self.assert_reconstruct(g, code) def test_starred_group(self): - g = """ start: (rule | NL)* rule: WORD ":" NUMBER @@ -54,7 +55,6 @@ class TestReconstructor(TestCase): self.assert_reconstruct(g, code) def test_alias(self): - g = """ start: line* line: NL @@ -142,6 +142,7 @@ class TestReconstructor(TestCase): new_json = Reconstructor(json_parser).reconstruct(tree) self.assertEqual(json.loads(new_json), json.loads(test_json)) + @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.") def test_switch_grammar_unicode_terminal(self): """ This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed @@ -179,6 +180,5 @@ class TestReconstructor(TestCase): assert l2.parse(code2) == tree - if __name__ == '__main__': unittest.main() From 364f9ae3a567883cc0fd1b114fa443dd690239f6 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 10 Nov 2020 14:28:01 -0500 Subject: [PATCH 5/8] Response to code review --- lark/load_grammar.py | 8 +++----- lark/reconstruct.py | 4 ++-- lark/utils.py | 8 ++++++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dcf90dd..e45053e 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -6,7 +6,7 @@ from copy import copy, deepcopy from io import open import pkgutil -from .utils import bfs, eval_escaping, Py36, logger, classify_bool, isalnum, isalpha +from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, isalpha from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -328,10 +328,8 @@ class PrepareAnonTerminals(Transformer_InPlace): try: term_name = _TERMINAL_NAMES[value] except KeyError: - if isalnum(value) and isalpha(value[0]) and value.upper() not in self.term_set: - with suppress(UnicodeEncodeError): - value.upper().encode('utf8') # Why shouldn't we have unicode in our terminal names? - term_name = value.upper() + if is_id_continue(value) and isalpha(value[0]) and value.upper() not in self.term_set: + term_name = value.upper() if term_name in self.term_set: term_name = None diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 614fb5e..2efc0ae 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -8,7 +8,7 @@ from .lexer import Token, PatternStr from .grammar import Terminal, NonTerminal from .tree_matcher import TreeMatcher, is_discarded_terminal -from .utils import isalnum +from .utils import is_id_continue def is_iter_empty(i): try: @@ -94,7 +94,7 @@ class Reconstructor(TreeMatcher): y = [] prev_item = '' for item in x: - if prev_item and item and isalnum(prev_item[-1]) and isalnum(item[0]): + if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]): y.append(' ') y.append(item) prev_item = item diff --git a/lark/utils.py b/lark/utils.py index b0f0e22..498a12a 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -13,9 +13,13 @@ logger.addHandler(logging.StreamHandler()) # By default, we should not output any log messages logger.setLevel(logging.CRITICAL) -def isalnum(x): +def is_id_continue(x): + """ + Checks if all characters in `x` are alphanumeric characters (Unicode standard, so diactrics, Indian vowels, non-latin + numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. + """ if len(x) != 1: - return all(isalnum(y) for y in x) + return all(is_id_continue(y) for y in x) return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] From 1b2fc2bda44e67ea3a57ea010d8ededbdb593af9 Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 10 Nov 2020 16:05:32 -0500 Subject: [PATCH 6/8] Code review #2 --- lark/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lark/utils.py b/lark/utils.py index 498a12a..29fa514 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -16,7 +16,7 @@ logger.setLevel(logging.CRITICAL) def is_id_continue(x): """ Checks if all characters in `x` are alphanumeric characters (Unicode standard, so diactrics, Indian vowels, non-latin - numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. + numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details. """ if len(x) != 1: return all(is_id_continue(y) for y in x) @@ -24,6 +24,7 @@ def is_id_continue(x): def isalpha(x): + """See PEP 3131 for details.""" if len(x) != 1: return all(isalpha(y) for y in x) return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'] From f8b0ca3ccc1f27bcaa9e7dedf6ae7374d677ac4f Mon Sep 17 00:00:00 2001 From: julienmalard Date: Tue, 10 Nov 2020 17:57:00 -0500 Subject: [PATCH 7/8] Code review 3 --- lark/load_grammar.py | 4 ++-- lark/utils.py | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index e45053e..ee38dc8 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -6,7 +6,7 @@ from copy import copy, deepcopy from io import open import pkgutil -from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, isalpha +from .utils import bfs, eval_escaping, Py36, logger, classify_bool, is_id_continue, is_id_start from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -328,7 +328,7 @@ class PrepareAnonTerminals(Transformer_InPlace): try: term_name = _TERMINAL_NAMES[value] except KeyError: - if is_id_continue(value) and isalpha(value[0]) and value.upper() not in self.term_set: + if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set: term_name = value.upper() if term_name in self.term_set: diff --git a/lark/utils.py b/lark/utils.py index 29fa514..89498c6 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -20,14 +20,17 @@ def is_id_continue(x): """ if len(x) != 1: return all(is_id_continue(y) for y in x) - return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] + return x == '_' or unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] -def isalpha(x): - """See PEP 3131 for details.""" +def is_id_start(x): + """ + Checks if all characters in `x` are alphabetic characters (Unicode standard, so diactrics, Indian vowels, non-latin + numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details. + """ if len(x) != 1: - return all(isalpha(y) for y in x) - return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'] + return all(is_id_start(y) for y in x) + return x == '_' or unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'] def classify(seq, key=None, value=None): From 89538498411f7675965ba248500d771f8a492fdd Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 29 Nov 2020 21:45:59 +0200 Subject: [PATCH 8/8] Allow unicode in terminal names --- lark/load_grammar.py | 4 +--- tests/test_nearley/nearley | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index af131c1..d663215 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -333,9 +333,7 @@ class PrepareAnonTerminals(Transformer_InPlace): term_name = _TERMINAL_NAMES[value] except KeyError: if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set: - with suppress(UnicodeEncodeError): - value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names - term_name = value.upper() + term_name = value.upper() if term_name in self.term_set: term_name = None diff --git a/tests/test_nearley/nearley b/tests/test_nearley/nearley index cf8925f..a46b374 160000 --- a/tests/test_nearley/nearley +++ b/tests/test_nearley/nearley @@ -1 +1 @@ -Subproject commit cf8925f729bde741a3076c5856c0c0862bc7f5de +Subproject commit a46b37471db486db0f6e1ce6a2934fb238346b44