@@ -7,7 +7,7 @@ from io import open | |||||
import pkgutil | import pkgutil | ||||
from ast import literal_eval | from ast import literal_eval | ||||
from .utils import bfs, Py36, logger, classify_bool | |||||
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start | |||||
from .lexer import Token, TerminalDef, PatternStr, PatternRE | from .lexer import Token, TerminalDef, PatternStr, PatternRE | ||||
from .parse_tree_builder import ParseTreeBuilder | from .parse_tree_builder import ParseTreeBuilder | ||||
@@ -332,10 +332,8 @@ class PrepareAnonTerminals(Transformer_InPlace): | |||||
try: | try: | ||||
term_name = _TERMINAL_NAMES[value] | term_name = _TERMINAL_NAMES[value] | ||||
except KeyError: | except KeyError: | ||||
if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set: | |||||
with suppress(UnicodeEncodeError): | |||||
value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names | |||||
term_name = value.upper() | |||||
if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set: | |||||
term_name = value.upper() | |||||
if term_name in self.term_set: | if term_name in self.term_set: | ||||
term_name = None | term_name = None | ||||
@@ -8,6 +8,7 @@ from .lexer import Token, PatternStr | |||||
from .grammar import Terminal, NonTerminal | from .grammar import Terminal, NonTerminal | ||||
from .tree_matcher import TreeMatcher, is_discarded_terminal | from .tree_matcher import TreeMatcher, is_discarded_terminal | ||||
from .utils import is_id_continue | |||||
def is_iter_empty(i): | def is_iter_empty(i): | ||||
try: | try: | ||||
@@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace): | |||||
return to_write | return to_write | ||||
def _isalnum(x): | |||||
# Categories defined here: https://www.python.org/dev/peps/pep-3131/ | |||||
return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'] | |||||
class Reconstructor(TreeMatcher): | class Reconstructor(TreeMatcher): | ||||
""" | """ | ||||
A Reconstructor that will, given a full parse Tree, generate source code. | A Reconstructor that will, given a full parse Tree, generate source code. | ||||
@@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher): | |||||
y = [] | y = [] | ||||
prev_item = '' | prev_item = '' | ||||
for item in x: | for item in x: | ||||
if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]): | |||||
if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]): | |||||
y.append(' ') | y.append(' ') | ||||
y.append(item) | y.append(item) | ||||
prev_item = item | prev_item = item | ||||
@@ -1,3 +1,4 @@ | |||||
import unicodedata | |||||
import os | import os | ||||
from functools import reduce | from functools import reduce | ||||
from collections import deque | from collections import deque | ||||
@@ -15,6 +16,7 @@ Py36 = (sys.version_info[:2] >= (3, 6)) | |||||
NO_VALUE = object() | NO_VALUE = object() | ||||
def classify(seq, key=None, value=None): | def classify(seq, key=None, value=None): | ||||
d = {} | d = {} | ||||
for item in seq: | for item in seq: | ||||
@@ -169,6 +171,29 @@ def get_regexp_width(expr): | |||||
###} | ###} | ||||
_ID_START = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc' | |||||
_ID_CONTINUE = _ID_START + ('Nd', 'Nl',) | |||||
def _test_unicode_category(s, categories): | |||||
if len(s) != 1: | |||||
return all(_test_unicode_category(char, categories) for char in s) | |||||
return s == '_' or unicodedata.category(s) in categories | |||||
def is_id_continue(s): | |||||
""" | |||||
Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, indian vowels, non-latin | |||||
numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details. | |||||
""" | |||||
return _test_unicode_category(s, _ID_CONTINUE) | |||||
def is_id_start(s): | |||||
""" | |||||
Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, indian vowels, non-latin | |||||
numbers, etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details. | |||||
""" | |||||
return _test_unicode_category(s, _ID_START) | |||||
def dedup_list(l): | def dedup_list(l): | ||||
"""Given a list (l) will removing duplicates from the list, | """Given a list (l) will removing duplicates from the list, | ||||
preserving the original order of the list. Assumes that | preserving the original order of the list. Assumes that | ||||
@@ -1,17 +1,22 @@ | |||||
# coding=utf-8 | |||||
import json | import json | ||||
import sys | |||||
import unittest | import unittest | ||||
from unittest import TestCase | from unittest import TestCase | ||||
from lark import Lark | from lark import Lark | ||||
from lark.reconstruct import Reconstructor | from lark.reconstruct import Reconstructor | ||||
common = """ | common = """ | ||||
%import common (WS_INLINE, NUMBER, WORD) | %import common (WS_INLINE, NUMBER, WORD) | ||||
%ignore WS_INLINE | %ignore WS_INLINE | ||||
""" | """ | ||||
def _remove_ws(s): | def _remove_ws(s): | ||||
return s.replace(' ', '').replace('\n','') | |||||
return s.replace(' ', '').replace('\n', '') | |||||
class TestReconstructor(TestCase): | class TestReconstructor(TestCase): | ||||
@@ -22,7 +27,6 @@ class TestReconstructor(TestCase): | |||||
self.assertEqual(_remove_ws(code), _remove_ws(new)) | self.assertEqual(_remove_ws(code), _remove_ws(new)) | ||||
def test_starred_rule(self): | def test_starred_rule(self): | ||||
g = """ | g = """ | ||||
start: item* | start: item* | ||||
item: NL | item: NL | ||||
@@ -38,7 +42,6 @@ class TestReconstructor(TestCase): | |||||
self.assert_reconstruct(g, code) | self.assert_reconstruct(g, code) | ||||
def test_starred_group(self): | def test_starred_group(self): | ||||
g = """ | g = """ | ||||
start: (rule | NL)* | start: (rule | NL)* | ||||
rule: WORD ":" NUMBER | rule: WORD ":" NUMBER | ||||
@@ -52,7 +55,6 @@ class TestReconstructor(TestCase): | |||||
self.assert_reconstruct(g, code) | self.assert_reconstruct(g, code) | ||||
def test_alias(self): | def test_alias(self): | ||||
g = """ | g = """ | ||||
start: line* | start: line* | ||||
line: NL | line: NL | ||||
@@ -140,6 +142,43 @@ class TestReconstructor(TestCase): | |||||
new_json = Reconstructor(json_parser).reconstruct(tree) | new_json = Reconstructor(json_parser).reconstruct(tree) | ||||
self.assertEqual(json.loads(new_json), json.loads(test_json)) | self.assertEqual(json.loads(new_json), json.loads(test_json)) | ||||
@unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
def test_switch_grammar_unicode_terminal(self):
    """
    Check that a parse tree built with a grammar containing only ascii
    characters can be reconstructed with a grammar that has unicode rules
    (or vice versa). The original bug assigned ANON terminals to unicode
    keywords, which offsets the ANON terminal count in the unicode grammar
    and causes subsequent identical ANON tokens (e.g., `+=`) to mis-match
    between the two grammars.
    """
    ascii_grammar = """
    start: (NL | stmt)*
    stmt: "keyword" var op var
    !op: ("+=" | "-=" | "*=" | "/=")
    var: WORD
    NL: /(\\r?\\n)+\s*/
    """ + common
    unicode_grammar = """
    start: (NL | stmt)*
    stmt: "குறிப்பு" var op var
    !op: ("+=" | "-=" | "*=" | "/=")
    var: WORD
    NL: /(\\r?\\n)+\s*/
    """ + common
    sample = """
    keyword x += y
    """
    ascii_parser = Lark(ascii_grammar, parser='lalr')
    unicode_parser = Lark(unicode_grammar, parser='lalr')
    reconstructor = Reconstructor(unicode_parser)
    tree = ascii_parser.parse(sample)
    # Round-trip: reconstruct with the unicode grammar, re-parse, compare.
    rebuilt = reconstructor.reconstruct(tree)
    assert unicode_parser.parse(rebuilt) == tree
# Allow running this test module directly: `python test_reconstructor.py`.
if __name__ == '__main__':
    unittest.main()