| @@ -7,7 +7,7 @@ from io import open | |||
| import pkgutil | |||
| from ast import literal_eval | |||
| from .utils import bfs, Py36, logger, classify_bool | |||
| from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start | |||
| from .lexer import Token, TerminalDef, PatternStr, PatternRE | |||
| from .parse_tree_builder import ParseTreeBuilder | |||
| @@ -332,10 +332,8 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||
| try: | |||
| term_name = _TERMINAL_NAMES[value] | |||
| except KeyError: | |||
| if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set: | |||
| with suppress(UnicodeEncodeError): | |||
| value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names | |||
| term_name = value.upper() | |||
| if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set: | |||
| term_name = value.upper() | |||
| if term_name in self.term_set: | |||
| term_name = None | |||
| @@ -8,6 +8,7 @@ from .lexer import Token, PatternStr | |||
| from .grammar import Terminal, NonTerminal | |||
| from .tree_matcher import TreeMatcher, is_discarded_terminal | |||
| from .utils import is_id_continue | |||
| def is_iter_empty(i): | |||
| try: | |||
| @@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace): | |||
| return to_write | |||
| def _isalnum(x): | |||
| # Categories defined here: https://www.python.org/dev/peps/pep-3131/ | |||
| return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] | |||
| class Reconstructor(TreeMatcher): | |||
| """ | |||
| A Reconstructor that will, given a full parse Tree, generate source code. | |||
| @@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher): | |||
| y = [] | |||
| prev_item = '' | |||
| for item in x: | |||
| if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]): | |||
| if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]): | |||
| y.append(' ') | |||
| y.append(item) | |||
| prev_item = item | |||
| @@ -1,3 +1,4 @@ | |||
| import unicodedata | |||
| import os | |||
| from functools import reduce | |||
| from collections import deque | |||
| @@ -15,6 +16,7 @@ Py36 = (sys.version_info[:2] >= (3, 6)) | |||
| NO_VALUE = object() | |||
| def classify(seq, key=None, value=None): | |||
| d = {} | |||
| for item in seq: | |||
| @@ -169,6 +171,29 @@ def get_regexp_width(expr): | |||
| ###} | |||
# Unicode general categories allowed in identifiers, following PEP 3131.
# ID_START: letters (Lu, Ll, Lt, Lm, Lo) and letter numbers (Nl).
_ID_START = ('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl')
# ID_CONTINUE: everything in ID_START, plus combining marks (Mn, Mc),
# decimal digits (Nd) and connector punctuation (Pc).
_ID_CONTINUE = _ID_START + ('Mn', 'Mc', 'Nd', 'Pc')


def _test_unicode_category(s, categories):
    """Return True if `s` is non-empty and every character is '_' or has a category in `categories`.

    Only the general category reported by `unicodedata.category` is checked;
    the Other_ID_Start / Other_ID_Continue properties mentioned in PEP 3131
    are not consulted.
    """
    if not s:
        # all() would be vacuously True here, but an empty string is not an identifier.
        return False
    return all(char == '_' or unicodedata.category(char) in categories for char in s)


def is_id_continue(s):
    """
    Checks if all characters in `s` may appear inside an identifier: letters, digits, combining marks and
    connector punctuation (Unicode standard, so diacritics, Indic vowel signs, non-latin numbers, etc. all pass).
    Mirrors Python's `ID_CONTINUE` class. See PEP 3131 for details. Returns False for an empty string.
    """
    return _test_unicode_category(s, _ID_CONTINUE)


def is_id_start(s):
    """
    Checks if all characters in `s` may begin an identifier: letters, letter numbers and the underscore
    (Unicode standard, so non-latin letters pass; digits and combining marks do not).
    Mirrors Python's `ID_START` class. See PEP 3131 for details. Returns False for an empty string.
    """
    return _test_unicode_category(s, _ID_START)
| def dedup_list(l): | |||
| """Given a list (l) will removing duplicates from the list, | |||
| preserving the original order of the list. Assumes that | |||
| @@ -1,17 +1,22 @@ | |||
| # coding=utf-8 | |||
| import json | |||
| import sys | |||
| import unittest | |||
| from unittest import TestCase | |||
| from lark import Lark | |||
| from lark.reconstruct import Reconstructor | |||
| common = """ | |||
| %import common (WS_INLINE, NUMBER, WORD) | |||
| %ignore WS_INLINE | |||
| """ | |||
| def _remove_ws(s): | |||
| return s.replace(' ', '').replace('\n','') | |||
| return s.replace(' ', '').replace('\n', '') | |||
| class TestReconstructor(TestCase): | |||
| @@ -22,7 +27,6 @@ class TestReconstructor(TestCase): | |||
| self.assertEqual(_remove_ws(code), _remove_ws(new)) | |||
| def test_starred_rule(self): | |||
| g = """ | |||
| start: item* | |||
| item: NL | |||
| @@ -38,7 +42,6 @@ class TestReconstructor(TestCase): | |||
| self.assert_reconstruct(g, code) | |||
| def test_starred_group(self): | |||
| g = """ | |||
| start: (rule | NL)* | |||
| rule: WORD ":" NUMBER | |||
| @@ -52,7 +55,6 @@ class TestReconstructor(TestCase): | |||
| self.assert_reconstruct(g, code) | |||
| def test_alias(self): | |||
| g = """ | |||
| start: line* | |||
| line: NL | |||
| @@ -140,6 +142,43 @@ class TestReconstructor(TestCase): | |||
| new_json = Reconstructor(json_parser).reconstruct(tree) | |||
| self.assertEqual(json.loads(new_json), json.loads(test_json)) | |||
@unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
def test_switch_grammar_unicode_terminal(self):
    """
    This test checks that a parse tree built with a grammar containing only ascii characters can be reconstructed
    with a grammar that has unicode rules (or vice versa). The original bug assigned ANON terminals to unicode
    keywords, which offsets the ANON terminal count in the unicode grammar and causes subsequent identical ANON
    tokens (e.g., `+=`) to mis-match between the two grammars.
    """
    # NOTE: every backslash in the NL pattern is doubled so the non-raw string
    # carries `\r`, `\n` and `\s` through to the grammar without relying on an
    # invalid string escape sequence.
    g1 = """
start: (NL | stmt)*
stmt: "keyword" var op var
!op: ("+=" | "-=" | "*=" | "/=")
var: WORD
NL: /(\\r?\\n)+\\s*/
""" + common

    g2 = """
start: (NL | stmt)*
stmt: "குறிப்பு" var op var
!op: ("+=" | "-=" | "*=" | "/=")
var: WORD
NL: /(\\r?\\n)+\\s*/
""" + common

    code = """
keyword x += y
"""

    l1 = Lark(g1, parser='lalr')
    l2 = Lark(g2, parser='lalr')
    r = Reconstructor(l2)

    # Parse with the ascii grammar, then reconstruct with the unicode one.
    tree = l1.parse(code)
    code2 = r.reconstruct(tree)
    # assertEqual (not a bare assert) so the check survives `python -O`.
    self.assertEqual(l2.parse(code2), tree)
| if __name__ == '__main__': | |||
| unittest.main() | |||