--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -7,7 +7,7 @@ from io import open
 import pkgutil
 from ast import literal_eval

-from .utils import bfs, Py36, logger, classify_bool
+from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start
 from .lexer import Token, TerminalDef, PatternStr, PatternRE

 from .parse_tree_builder import ParseTreeBuilder
@@ -332,10 +332,8 @@ class PrepareAnonTerminals(Transformer_InPlace):
             try:
                 term_name = _TERMINAL_NAMES[value]
             except KeyError:
-                if value.isalnum() and value[0].isalpha() and value.upper() not in self.term_set:
-                    with suppress(UnicodeEncodeError):
-                        value.upper().encode('ascii') # Make sure we don't have unicode in our terminal names
-                        term_name = value.upper()
+                if is_id_continue(value) and is_id_start(value[0]) and value.upper() not in self.term_set:
+                    term_name = value.upper()

             if term_name in self.term_set:
                 term_name = None
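In plain terms: the old guard rejected any quoted string that failed `.encode('ascii')`, so a Unicode keyword never received a derived terminal name and fell through to an anonymous (ANON) terminal. The new check accepts anything that is a valid Unicode identifier. Below is a minimal standalone sketch of this naming rule; `derive_term_name` and `_cat_ok` are illustrative helpers for this note, not lark's actual API:

import unicodedata

_ID_START    = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
_ID_CONTINUE = _ID_START + ('Nd', 'Nl')

def _cat_ok(s, categories):
    # True if every character is '_' or falls in one of the given categories
    return all(ch == '_' or unicodedata.category(ch) in categories for ch in s)

def derive_term_name(value, term_set):
    """Return an uppercase terminal name for `value`, or None (i.e. ANON)."""
    if _cat_ok(value, _ID_CONTINUE) and _cat_ok(value[0], _ID_START) \
            and value.upper() not in term_set:
        return value.upper()
    return None

assert derive_term_name('keyword', set()) == 'KEYWORD'
assert derive_term_name('குறிப்பு', set()) == 'குறிப்பு'.upper()  # named, no longer ANON
assert derive_term_name('+=', set()) is None                      # still ANON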
--- a/lark/reconstruct.py
+++ b/lark/reconstruct.py
@@ -8,6 +8,7 @@ from .lexer import Token, PatternStr
 from .grammar import Terminal, NonTerminal
 from .tree_matcher import TreeMatcher, is_discarded_terminal
+from .utils import is_id_continue


 def is_iter_empty(i):
     try:
@@ -56,10 +57,6 @@ class WriteTokensTransformer(Transformer_InPlace):
         return to_write


-def _isalnum(x):
-    # Categories defined here: https://www.python.org/dev/peps/pep-3131/
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
-
 class Reconstructor(TreeMatcher):
     """
     A Reconstructor that will, given a full parse Tree, generate source code.
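As a side note, the removed helper and the new `is_id_continue` (added in the utils.py hunk below) agree on single characters: the category list above is exactly `_ID_START + ('Nd', 'Nl')`. A one-off check, assuming nothing beyond the two category lists quoted in this diff:

# The deleted _isalnum's categories equal the new _ID_CONTINUE set,
# so per-character behaviour is preserved by this refactor.
old_isalnum_cats = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc'}
new_id_continue  = {'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'} | {'Nd', 'Nl'}
assert old_isalnum_cats == new_id_continue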
@@ -97,7 +94,7 @@ class Reconstructor(TreeMatcher):
         y = []
         prev_item = ''
         for item in x:
-            if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
+            if prev_item and item and is_id_continue(prev_item[-1]) and is_id_continue(item[0]):
                 y.append(' ')
             y.append(item)
             prev_item = item
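This is the whitespace-joining step of the reconstructor: a space is emitted only where gluing two adjacent items would fuse identifier characters into one token. A self-contained sketch of the same rule (the import assumes the `is_id_continue` helper this patch adds to `lark.utils`):

from lark.utils import is_id_continue

def join_tokens(tokens):
    out = []
    prev = ''
    for item in tokens:
        # Insert a space only where two identifier characters would
        # otherwise merge into a single token.
        if prev and item and is_id_continue(prev[-1]) and is_id_continue(item[0]):
            out.append(' ')
        out.append(item)
        prev = item
    return ''.join(out)

assert join_tokens(['keyword', 'x']) == 'keyword x'  # would merge, so add a space
assert join_tokens(['x', '+=', 'y']) == 'x+=y'       # '+' and '=' are not id-continue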
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -1,3 +1,4 @@
+import unicodedata
 import os
 from functools import reduce
 from collections import deque
@@ -15,6 +16,7 @@ Py36 = (sys.version_info[:2] >= (3, 6))

 NO_VALUE = object()

+
 def classify(seq, key=None, value=None):
     d = {}
     for item in seq:
@@ -169,6 +171,29 @@ def get_regexp_width(expr):
 ###}


+_ID_START    = 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Mc', 'Pc'
+_ID_CONTINUE = _ID_START + ('Nd', 'Nl',)
+
+def _test_unicode_category(s, categories):
+    if len(s) != 1:
+        return all(_test_unicode_category(char, categories) for char in s)
+    return s == '_' or unicodedata.category(s) in categories
+
+def is_id_continue(s):
+    """
+    Checks if all characters in `s` are alphanumeric characters (Unicode standard, so diacritics, Indic vowel signs,
+    non-Latin numbers, etc. all pass). Synonymous with a Python `ID_CONTINUE` identifier. See PEP 3131 for details.
+    """
+    return _test_unicode_category(s, _ID_CONTINUE)
+
+def is_id_start(s):
+    """
+    Checks if all characters in `s` are alphabetic characters (Unicode standard, so diacritics, non-Latin letters,
+    etc. all pass). Synonymous with a Python `ID_START` identifier. See PEP 3131 for details.
+    """
+    return _test_unicode_category(s, _ID_START)
+
 def dedup_list(l):
     """Given a list (l), removes duplicates from it,
     preserving the original order of the list. Assumes that
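A few examples of how the new helpers behave, assuming the patched `lark.utils` is importable; the expected values follow directly from the category tuples above:

from lark.utils import is_id_start, is_id_continue

assert is_id_start('x') and is_id_start('க') and is_id_start('_')
assert not is_id_start('1variable')  # digits (Nd) may not start an identifier
assert is_id_continue('variable1')   # multi-char strings are checked char by char
assert is_id_continue('変数2')        # non-Latin letters and digits pass
assert not is_id_continue('a-b')     # '-' (category Pd) is not an identifier char
# Edge case: both return True for '', since all() over zero characters is True.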
--- a/tests/test_reconstructor.py
+++ b/tests/test_reconstructor.py
@@ -1,17 +1,22 @@
 # coding=utf-8

 import json
+import sys
+import unittest
 from unittest import TestCase

 from lark import Lark
 from lark.reconstruct import Reconstructor

 common = """
 %import common (WS_INLINE, NUMBER, WORD)
 %ignore WS_INLINE
 """


 def _remove_ws(s):
-    return s.replace(' ', '').replace('\n','')
+    return s.replace(' ', '').replace('\n', '')


 class TestReconstructor(TestCase):
@@ -22,7 +27,6 @@ class TestReconstructor(TestCase):
         self.assertEqual(_remove_ws(code), _remove_ws(new))

-
     def test_starred_rule(self):
         g = """
         start: item*
         item: NL
@@ -38,7 +42,6 @@ class TestReconstructor(TestCase):
         self.assert_reconstruct(g, code)

-
     def test_starred_group(self):
         g = """
         start: (rule | NL)*
         rule: WORD ":" NUMBER
@@ -52,7 +55,6 @@ class TestReconstructor(TestCase):
         self.assert_reconstruct(g, code)

-
     def test_alias(self):
         g = """
         start: line*
         line: NL
@@ -140,6 +142,43 @@ class TestReconstructor(TestCase):
         new_json = Reconstructor(json_parser).reconstruct(tree)
         self.assertEqual(json.loads(new_json), json.loads(test_json))

+    @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.")
+    def test_switch_grammar_unicode_terminal(self):
+        """
+        This test checks that a parse tree built with a grammar containing only ASCII characters can be reconstructed
+        with a grammar that has Unicode rules (or vice versa). The original bug assigned ANON terminals to Unicode
+        keywords, which offset the ANON terminal count in the Unicode grammar and caused subsequent identical ANON
+        tokens (e.g. `+=`) to mismatch between the two grammars.
+        """
+        g1 = """
+        start: (NL | stmt)*
+        stmt: "keyword" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\\s*/
+        """ + common
+
+        g2 = """
+        start: (NL | stmt)*
+        stmt: "குறிப்பு" var op var
+        !op: ("+=" | "-=" | "*=" | "/=")
+        var: WORD
+        NL: /(\\r?\\n)+\\s*/
+        """ + common
+
+        code = """
+        keyword x += y
+        """
+
+        l1 = Lark(g1, parser='lalr')
+        l2 = Lark(g2, parser='lalr')
+        r = Reconstructor(l2)
+
+        tree = l1.parse(code)
+        code2 = r.reconstruct(tree)
+        assert l2.parse(code2) == tree
+

 if __name__ == '__main__':
     unittest.main()